From e9b6de3a1256e5415a885c53140882679e0d159a Mon Sep 17 00:00:00 2001
From: YdrMaster <ydrml@hotmail.com>
Date: Mon, 17 Feb 2025 16:47:54 +0800
Subject: [PATCH 1/5] =?UTF-8?q?issue/52:=20=E4=BF=AE=E6=94=B9=E6=A0=BC?=
 =?UTF-8?q?=E5=BC=8F=E9=85=8D=E7=BD=AE=EF=BC=8C=E6=B7=BB=E5=8A=A0=E6=A0=BC?=
 =?UTF-8?q?=E5=BC=8F=E5=8C=96=E8=84=9A=E6=9C=AC?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: YdrMaster <ydrml@hotmail.com>
---
 .clang-format     |  70 ++++------------
 scripts/format.py | 204 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 221 insertions(+), 53 deletions(-)
 create mode 100644 scripts/format.py

diff --git a/.clang-format b/.clang-format
index 66c6e431e..c05c7633b 100644
--- a/.clang-format
+++ b/.clang-format
@@ -1,21 +1,14 @@
-﻿# Generated from CLion C/C++ Code Style settings
+﻿---
 BasedOnStyle: LLVM
-AccessModifierOffset: -4
-AlignAfterOpenBracket: Align
-# AlignConsecutiveAssignments: None
-AlignOperands: Align
-AllowAllArgumentsOnNextLine: false
-AllowAllConstructorInitializersOnNextLine: false
-AllowAllParametersOfDeclarationOnNextLine: false
-AllowShortBlocksOnASingleLine: Always
-AllowShortCaseLabelsOnASingleLine: false
-AllowShortFunctionsOnASingleLine: All
-AllowShortIfStatementsOnASingleLine: Always
-AllowShortLambdasOnASingleLine: All
-AllowShortLoopsOnASingleLine: true
-AlwaysBreakAfterReturnType: None
-AlwaysBreakTemplateDeclarations: No
-BreakBeforeBraces: Custom
+IndentWidth: 4                        # 缩进宽度，LLVM 默认值为 2，改为 4
+AccessModifierOffset: -4              # public/protected/private 访问控制符相对成员的偏移，与 IndentWidth 配合，LLVM 默认值为 -2
+AlignOperands: AlignAfterOperator     # 双目运算符的行间对齐，LLVM 默认值为 Align，改为带符号一起换行
+BreakBeforeBinaryOperators: All       # 在双目运算符之前换行，LLVM 默认值为 None，改为换行时总是把双目运算符放在行首，包括赋值（=）
+ColumnLimit: 0                        # 列宽限制，LLVM 默认值为 80，改为不限制
+AllowShortBlocksOnASingleLine: Always # 是否允许短块（单个语句的块）不换行，LLVM 默认值为 Never，改为允许
+AllowShortLoopsOnASingleLine: true    # 是否允许短循环不换行，LLVM 默认值为 false，改为允许
+InsertBraces: true                    # 是否在 if/for/while/switch 等语句后插入大括号，LLVM 默认值为 false，改为允许
+BreakBeforeBraces: Custom             # 大括号换行配置，LLVM 默认值为 LLVM，改为自定义以使 BraceWrapping 生效
 BraceWrapping:
   AfterCaseLabel: false
   AfterClass: false
@@ -23,44 +16,15 @@ BraceWrapping:
   AfterEnum: false
   AfterFunction: false
   AfterNamespace: false
+  AfterObjCDeclaration: false
+  AfterStruct: false
   AfterUnion: false
+  AfterExternBlock: false
   BeforeCatch: false
   BeforeElse: false
+  BeforeLambdaBody: false
+  BeforeWhile: false
   IndentBraces: false
-  SplitEmptyFunction: false
+  SplitEmptyFunction: true
   SplitEmptyRecord: true
-BreakBeforeBinaryOperators: None
-BreakBeforeTernaryOperators: true
-BreakConstructorInitializers: BeforeColon
-BreakInheritanceList: BeforeColon
-ColumnLimit: 0
-CompactNamespaces: true
-ContinuationIndentWidth: 4
-IndentCaseLabels: true
-IndentPPDirectives: None
-IndentWidth: 4
-KeepEmptyLinesAtTheStartOfBlocks: true
-MaxEmptyLinesToKeep: 2
-NamespaceIndentation: All
-ObjCSpaceAfterProperty: false
-ObjCSpaceBeforeProtocolList: true
-PointerAlignment: Right
-ReflowComments: false
-SpaceAfterCStyleCast: true
-SpaceAfterLogicalNot: false
-SpaceAfterTemplateKeyword: false
-SpaceBeforeAssignmentOperators: true
-SpaceBeforeCpp11BracedList: false
-SpaceBeforeCtorInitializerColon: true
-SpaceBeforeInheritanceColon: true
-SpaceBeforeParens: ControlStatements
-SpaceBeforeRangeBasedForLoopColon: true
-SpaceInEmptyParentheses: false
-SpacesBeforeTrailingComments: 0
-SpacesInAngles: false
-SpacesInCStyleCastParentheses: false
-SpacesInContainerLiterals: false
-SpacesInParentheses: false
-SpacesInSquareBrackets: false
-TabWidth: 4
-UseTab: Never
+  SplitEmptyNamespace: true
diff --git a/scripts/format.py b/scripts/format.py
new file mode 100644
index 000000000..23969259d
--- /dev/null
+++ b/scripts/format.py
@@ -0,0 +1,204 @@
+import argparse
+import subprocess
+import os
+from pathlib import Path
+from colorama import Fore, Style
+
+# 支持的文件类型
+SUPPORTED_FILES = {
+    ".h": "c",
+    ".hh": "c",
+    ".hpp": "c",
+    ".c": "c",
+    ".cc": "c",
+    ".cpp": "c",
+    ".cxx": "c",
+    ".cu": "c",
+    ".cuh": "c",
+    ".mlu": "c",
+    ".cl": "c",
+    ".py": "py",
+}
+
+
+def format_file(file: Path, check: bool, formatter) -> bool:
+    formatter = formatter.get(SUPPORTED_FILES.get(file.suffix, None), None)
+    if not formatter:
+        return True  # 文件类型不支持，跳过
+
+    try:
+        cmd = []
+        if formatter.startswith("clang-format"):
+            cmd = [formatter, "-style=file", "-i", file]
+            if check:
+                cmd.insert(2, "-dry-run")
+                process = subprocess.run(
+                    cmd,
+                    capture_output=True,
+                    text=True,
+                    check=True,
+                )
+                if process.stderr:
+                    print(f"{Fore.YELLOW}{file} is not formatted.{Style.RESET_ALL}")
+                    print(
+                        f"Use {Fore.CYAN}{formatter} -style=file -i {file}{Style.RESET_ALL} to format it."
+                    )
+                    return False
+            else:
+                subprocess.run(
+                    cmd,
+                    capture_output=True,
+                    text=True,
+                    check=True,
+                )
+                print(f"{Fore.CYAN}Formatted: {file}{Style.RESET_ALL}")
+        elif formatter == "black":
+            cmd = [formatter, file]
+            if check:
+                cmd.insert(1, "--check")
+                process = subprocess.run(
+                    cmd,
+                    capture_output=True,
+                    text=True,
+                    check=True,
+                )
+                if process.stderr:
+                    print(f"{Fore.YELLOW}{file} is not formatted.{Style.RESET_ALL}")
+                    print(
+                        f"Use {Fore.CYAN}{formatter} {file}{Style.RESET_ALL} to format it."
+                    )
+                    return False
+            else:
+                subprocess.run(
+                    cmd,
+                    capture_output=True,
+                    text=True,
+                    check=True,
+                )
+                print(f"{Fore.CYAN}Formatted: {file}{Style.RESET_ALL}")
+    except FileNotFoundError:
+        print(
+            f"{Fore.RED}Formatter {formatter} not found, {file} skipped.{Style.RESET_ALL}"
+        )
+    except subprocess.CalledProcessError as e:
+        print(f"{Fore.RED}Formatter {formatter} failed: {e}{Style.RESET_ALL}")
+
+    return True
+
+
+def git_added_files():
+    """获取所有已暂存更改的文件"""
+    try:
+        # 使用 git diff --cached --name-only 获取所有已添加到暂存区的文件
+        result = subprocess.run(
+            ["git", "diff", "--cached", "--name-only"],
+            capture_output=True,
+            text=True,
+            check=True,
+        )
+        for file in result.stdout.splitlines():
+            yield Path(file.strip())
+    except subprocess.CalledProcessError as e:
+        print(f"{Fore.RED}Git diff failed: {e}{Style.RESET_ALL}")
+
+
+def git_modified_since_ref(ref):
+    """获取从指定的 Git 引用到当前状态的修改文件列表"""
+    try:
+        result = subprocess.run(
+            ["git", "diff", f"{ref}..", "--diff-filter=AMR", "--name-only"],
+            capture_output=True,
+            text=True,
+            check=True,
+        )
+        for file in result.stdout.splitlines():
+            yield Path(file.strip())
+    except subprocess.CalledProcessError as e:
+        print(f"{Fore.RED}Git diff failed: {e}{Style.RESET_ALL}")
+
+
+def list_files(paths):
+    """递归获取指定路径下的所有文件"""
+    files = []
+    for path in paths:
+        if path.is_file():
+            yield path
+        elif path.is_dir():
+            for dirpath, _, filenames in os.walk(path):
+                for name in filenames:
+                    yield Path(dirpath) / name
+        else:
+            print(
+                f"{Fore.RED}Error: {path} is not a file or directory.{Style.RESET_ALL}"
+            )
+
+
+def filter_in_path(file: Path, path) -> bool:
+    """判断文件是否在指定路径下"""
+    for p in path:
+        if file.is_relative_to(p):
+            return True
+    return False
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--ref", type=str, help="Git reference (commit hash) to compare against."
+    )
+    parser.add_argument(
+        "--path", nargs="*", type=Path, help="Files to format or check."
+    )
+    parser.add_argument(
+        "--check", action="store_true", help="Check files without modifying them."
+    )
+    parser.add_argument(
+        "--c", default="clang-format-16", help="C formatter (default: clang-format-16)"
+    )
+    parser.add_argument(
+        "--py", default="black", help="Python formatter (default: black)"
+    )
+    args = parser.parse_args()
+
+    if args.ref is None and args.path is None:
+        # Last commit.
+        print("{Fore.GREEN}Formating git added files.{Style.RESET_ALL}")
+        files = git_added_files()
+
+    else:
+        if args.ref is None:
+            print(f"{Fore.GREEN}Formating files in {args.path}.{Style.RESET_ALL}")
+            files = list_files(args.path)
+        elif args.path is None:
+            print(
+                f"{Fore.GREEN}Formating git modified files from {args.ref}.{Style.RESET_ALL}"
+            )
+            files = git_modified_since_ref(args.ref)
+        else:
+            print(
+                f"{Fore.GREEN}Formating git modified files from {args.ref} in {args.path}.{Style.RESET_ALL}"
+            )
+            files = (
+                file
+                for file in git_modified_since_ref(args.ref)
+                if filter_in_path(file, args.path)
+            )
+
+    formatted = True
+    for file in files:
+        if not format_file(
+            file,
+            args.check,
+            {
+                "c": args.c,
+                "py": args.py,
+            },
+        ):
+            formatted = False
+
+    if not formatted:
+        exit(1)
+
+
+if __name__ == "__main__":
+    main()

From 27ba98d149b9d827b353037d91fbe26af91bceff Mon Sep 17 00:00:00 2001
From: YdrMaster <ydrml@hotmail.com>
Date: Mon, 17 Feb 2025 19:10:45 +0800
Subject: [PATCH 2/5] =?UTF-8?q?issue/52:=20=E4=BF=AE=E6=94=B9=20README.md?=
 =?UTF-8?q?=EF=BC=8C=E6=B7=BB=E5=8A=A0=E6=A0=BC=E5=BC=8F=E5=8C=96=E8=84=9A?=
 =?UTF-8?q?=E6=9C=AC=E4=BD=BF=E7=94=A8=E8=AF=B4=E6=98=8E?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: YdrMaster <ydrml@hotmail.com>
---
 README.md | 111 +++++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 80 insertions(+), 31 deletions(-)

diff --git a/README.md b/README.md
index 2e77be855..b38f451e5 100644
--- a/README.md
+++ b/README.md
@@ -1,50 +1,99 @@
 # InfiniCore
 
-InfiniCore是一个跨平台统一编程工具集，为不同芯片平台的功能（包括计算、运行时、通信等）提供统一 C 语言接口。目前支持的芯片包括CPU、英伟达GPU、华为昇腾NPU、寒武纪MLU、摩尔线程GPU、天数智芯GPU、沐曦GPU、曙光DCU、昆仑芯。
+InfiniCore 是一个跨平台统一编程工具集，为不同芯片平台的功能（包括计算、运行时、通信等）提供统一 C 语言接口。目前支持的硬件和后端包括：
 
-## 一、使用说明
+- CPU；
+- CUDA
+  - 英伟达 GPU；
+  - 摩尔线程 GPU；
+  - 天数智芯 GPU；
+  - 沐曦 GPU；
+  - 曙光 DCU；
+- 华为昇腾 NPU；
+- 寒武纪 MLU；
+- 昆仑芯 XPU；
 
-### 1. 配置
+## 配置和使用
 
-#### 查看当前配置
+1. 项目配置
 
-```xmake
-xmake f -v
-```
+   - 查看当前配置
 
-#### 配置 CPU （默认配置）
+     ```shell
+     xmake f -v
+     ```
 
-```xmake
-xmake f -cv
-```
+   - 配置 CPU（默认配置）
 
-#### 配置加速卡
+     ```shell
+     xmake f -cv
+     ```
 
-```xmake
-# 英伟达
-# 可以指定 CUDA 路径， 一般环境变量为 `CUDA_HOME` 或者 `CUDA_ROOT`
-xmake f --nv-gpu=true --cuda=$CUDA_HOME -cv
+   - 配置加速卡
 
-# 寒武纪
-xmake f --cambricon-mlu=true -cv
+     ```shell
+     # 英伟达
+     # 可以指定 CUDA 路径， 一般环境变量为 `CUDA_HOME` 或者 `CUDA_ROOT`
+     xmake f --nv-gpu=true --cuda=$CUDA_HOME -cv
 
-# 华为昇腾
-xmake f --ascend-npu=true -cv
-```
+     # 寒武纪
+     xmake f --cambricon-mlu=true -cv
 
-### 2. 编译安装
+     # 华为昇腾
+     xmake f --ascend-npu=true -cv
+     ```
 
-```xmake
-xmake build && xmake install
-# 默认安装路径为 $HOME/.infini
-```
+2. 编译安装
+
+   默认安装路径为 `$HOME/.infini`。
+
+   ```shell
+   xmake build && xmake install
+   ```
+
+3. 设置环境变量
+
+   按输出提示设置 `INFINI_ROOT` 和 `LD_LIBRARY_PATH` 环境变量。
+
+4. 运行算子测试
 
-### 3. 设置环境变量
+   ```shell
+   python test/infiniop/[operator].py [--cpu | --nvidia | --cambricon | --ascend]
+   ```
 
-按输出提示设置 `INFINI_ROOT` 和 `LD_LIBRARY_PATH` 环境变量。
+## 开发指南
 
-### 4. 运行算子测试
+### 代码格式化
 
-```bash
-python test/infiniop/[operator].py [--cpu | --nvidia | --cambricon | --ascend]
+本项目使用 [`scripts/format.py`](/scripts/format.py) 脚本实现代码格式化检查和操作。
+
+使用
+
+```shell
+python scripts/format.py -h
+```
+
+查看脚本帮助信息：
+
+```plaintext
+usage: format.py [-h] [--ref REF] [--path [PATH ...]] [--check] [--c C] [--py PY]
+
+options:
+  -h, --help         show this help message and exit
+  --ref REF          Git reference (commit hash) to compare against.
+  --path [PATH ...]  Files to format or check.
+  --check            Check files without modifying them.
+  --c C              C formatter (default: clang-format-16)
+  --py PY            Python formatter (default: black)
 ```
+
+参数中：
+
+- `ref` 和 `path` 控制格式化的文件范围
+  - 若 `ref` 和 `path` 都为空，格式化当前暂存（git added）的文件；
+  - 否则
+    - 若 `ref` 非空，将比较指定 commit 和当前代码的差异，只格式化修改过的文件；
+    - 若 `path` 非空，可传入多个路径（`--path p0 p1 p2`），只格式化指定路径及其子目录中的文件；
+- 若设置 `--check`，将检查代码是否需要修改格式，不修改文件内容；
+- 通过 `--c` 指定 c/c++ 格式化器，默认为 `clang-format-16`；
+- 通过 `--python` 指定 python 格式化器 `black`；

From ec0ff89340a115420d2097fadc5c78be704a9ed2 Mon Sep 17 00:00:00 2001
From: YdrMaster <ydrml@hotmail.com>
Date: Mon, 17 Feb 2025 18:40:11 +0800
Subject: [PATCH 3/5] =?UTF-8?q?issue/52:=20=E6=A0=BC=E5=BC=8F=E5=8C=96?=
 =?UTF-8?q?=E6=89=80=E6=9C=89=20c/c++=20=E6=96=87=E4=BB=B6?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: YdrMaster <ydrml@hotmail.com>
---
 include/infinicore.h                          |   3 +-
 include/infiniop/ops/causal_softmax.h         |   1 -
 include/infiniop/ops/conv.h                   |   1 -
 include/infiniop/ops/random_sample.h          |   1 -
 include/infiniop/tensor_descriptor.h          |   2 +-
 src/infiniop/devices/ascend/common_ascend.cc  | 176 +++++++++---------
 src/infiniop/devices/ascend/common_ascend.h   |   1 -
 src/infiniop/devices/ascend/tensor_aclnn.cc   |   7 +-
 src/infiniop/devices/cpu/common_cpu.cc        |   9 +-
 src/infiniop/devices/cpu/common_cpu.h         |   4 +-
 src/infiniop/devices/cuda/common_cuda.cuh     |  44 ++---
 src/infiniop/devices/pool.h                   |   6 +-
 src/infiniop/ops/causal_softmax/operator.cc   | 142 +++++++-------
 .../ops/matmul/ascend/matmul_aclnn.cc         |  12 +-
 src/infiniop/ops/matmul/bang/matmul_cnnl.cc   |   4 +-
 src/infiniop/ops/matmul/blas.h                |   4 +-
 src/infiniop/ops/matmul/cpu/matmul_cpu.cc     |  19 +-
 src/infiniop/ops/matmul/cuda/matmul_cuda.cu   |   2 +-
 src/infiniop/ops/matmul/cuda/matmul_cuda.cuh  |   6 +-
 .../ops/matmul/cuda/matmul_cuda_api.h         |   2 -
 .../ops/matmul/cuda/matmul_cuda_kernel.cu     |   4 +-
 src/infiniop/ops/random_sample/operator.cc    | 136 +++++++-------
 src/infiniop/ops/rearrange/operator.cc        | 114 ++++++------
 src/infiniop/ops/rms_norm/operator.cc         | 160 ++++++++--------
 src/infiniop/ops/utils.h                      |  64 +++----
 25 files changed, 447 insertions(+), 477 deletions(-)

diff --git a/include/infinicore.h b/include/infinicore.h
index 42c134f6d..969b93599 100644
--- a/include/infinicore.h
+++ b/include/infinicore.h
@@ -6,8 +6,7 @@
 #define __INFINICORE_EXPORT_C__
 #if defined(_WIN32)
 #define __export __declspec(dllexport)
-#elif defined(__GNUC__) &&                                                     \
-    ((__GNUC__ >= 4) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 3))
+#elif defined(__GNUC__) && ((__GNUC__ >= 4) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 3))
 #define __export __attribute__((visibility("default")))
 #else
 #define __export
diff --git a/include/infiniop/ops/causal_softmax.h b/include/infiniop/ops/causal_softmax.h
index 78bbaf475..e52579032 100644
--- a/include/infiniop/ops/causal_softmax.h
+++ b/include/infiniop/ops/causal_softmax.h
@@ -19,5 +19,4 @@ __C __export infiniopStatus_t infiniopCausalSoftmax(infiniopCausalSoftmaxDescrip
 
 __C __export infiniopStatus_t infiniopDestroyCausalSoftmaxDescriptor(infiniopCausalSoftmaxDescriptor_t desc);
 
-
 #endif
diff --git a/include/infiniop/ops/conv.h b/include/infiniop/ops/conv.h
index 2b95048fb..ace912ce5 100644
--- a/include/infiniop/ops/conv.h
+++ b/include/infiniop/ops/conv.h
@@ -21,5 +21,4 @@ __C __export infiniopStatus_t infiniopConv(infiniopConvDescriptor_t desc, void *
 
 __C __export infiniopStatus_t infiniopDestroyConvDescriptor(infiniopConvDescriptor_t desc);
 
-
 #endif
diff --git a/include/infiniop/ops/random_sample.h b/include/infiniop/ops/random_sample.h
index 48137737d..6be5713b5 100644
--- a/include/infiniop/ops/random_sample.h
+++ b/include/infiniop/ops/random_sample.h
@@ -22,5 +22,4 @@ __C __export infiniopStatus_t infiniopRandomSample(infiniopRandomSampleDescripto
 
 __C __export infiniopStatus_t infiniopDestroyRandomSampleDescriptor(infiniopRandomSampleDescriptor_t desc);
 
-
 #endif
diff --git a/include/infiniop/tensor_descriptor.h b/include/infiniop/tensor_descriptor.h
index 03e526996..f97a4a4a6 100644
--- a/include/infiniop/tensor_descriptor.h
+++ b/include/infiniop/tensor_descriptor.h
@@ -21,4 +21,4 @@ __C __export infiniopStatus_t infiniopCreateTensorDescriptor(infiniopTensorDescr
 
 __C __export infiniopStatus_t infiniopDestroyTensorDescriptor(infiniopTensorDescriptor_t desc);
 
-#endif// __INFINIOP_TENSOR_DESCRIPTOR__
+#endif // __INFINIOP_TENSOR_DESCRIPTOR__
diff --git a/src/infiniop/devices/ascend/common_ascend.cc b/src/infiniop/devices/ascend/common_ascend.cc
index 3b42b38ca..7a2798d6e 100644
--- a/src/infiniop/devices/ascend/common_ascend.cc
+++ b/src/infiniop/devices/ascend/common_ascend.cc
@@ -31,115 +31,115 @@ infiniopStatus_t freeWorkspace(void *workspaceAddr) {
 }
 
 aclDataType toAclDataType(infiniDtype_t dt) {
-    if (dt == INFINI_DTYPE_I8)
+    if (dt == INFINI_DTYPE_I8) {
         return aclDataType::ACL_INT8;
-    else if (dt == INFINI_DTYPE_I16)
+    } else if (dt == INFINI_DTYPE_I16) {
         return aclDataType::ACL_INT16;
-    else if (dt == INFINI_DTYPE_I32)
+    } else if (dt == INFINI_DTYPE_I32) {
         return aclDataType::ACL_INT32;
-    else if (dt == INFINI_DTYPE_I64)
+    } else if (dt == INFINI_DTYPE_I64) {
         return aclDataType::ACL_INT64;
-    else if (dt == INFINI_DTYPE_U8)
+    } else if (dt == INFINI_DTYPE_U8) {
         return aclDataType::ACL_UINT8;
-    else if (dt == INFINI_DTYPE_U16)
+    } else if (dt == INFINI_DTYPE_U16) {
         return aclDataType::ACL_UINT16;
-    else if (dt == INFINI_DTYPE_U32)
+    } else if (dt == INFINI_DTYPE_U32) {
         return aclDataType::ACL_UINT32;
-    else if (dt == INFINI_DTYPE_U64)
+    } else if (dt == INFINI_DTYPE_U64) {
         return aclDataType::ACL_UINT64;
-    else if (dt == INFINI_DTYPE_F16)
+    } else if (dt == INFINI_DTYPE_F16) {
         return aclDataType::ACL_FLOAT16;
-    else if (dt == INFINI_DTYPE_BF16)
+    } else if (dt == INFINI_DTYPE_BF16) {
         return aclDataType::ACL_BF16;
-    else if (dt == INFINI_DTYPE_F32)
+    } else if (dt == INFINI_DTYPE_F32) {
         return aclDataType::ACL_FLOAT;
-    else if (dt == INFINI_DTYPE_F64)
+    } else if (dt == INFINI_DTYPE_F64) {
         return aclDataType::ACL_DOUBLE;
-    else
+    } else {
         return aclDataType::ACL_DT_UNDEFINED;
+    }
 }
 
-
 const char *dataTypeToString(aclDataType dtype) {
     switch (dtype) {
-        case ACL_DT_UNDEFINED:
-            return "ACL_DT_UNDEFINED";
-        case ACL_FLOAT:
-            return "ACL_FLOAT";
-        case ACL_FLOAT16:
-            return "ACL_FLOAT16";
-        case ACL_INT8:
-            return "ACL_INT8";
-        case ACL_INT32:
-            return "ACL_INT32";
-        case ACL_UINT8:
-            return "ACL_UINT8";
-        case ACL_INT16:
-            return "ACL_INT16";
-        case ACL_UINT16:
-            return "ACL_UINT16";
-        case ACL_UINT32:
-            return "ACL_UINT32";
-        case ACL_INT64:
-            return "ACL_INT64";
-        case ACL_UINT64:
-            return "ACL_UINT64";
-        case ACL_DOUBLE:
-            return "ACL_DOUBLE";
-        case ACL_BOOL:
-            return "ACL_BOOL";
-        case ACL_STRING:
-            return "ACL_STRING";
-        case ACL_COMPLEX64:
-            return "ACL_COMPLEX64";
-        case ACL_COMPLEX128:
-            return "ACL_COMPLEX128";
-        case ACL_BF16:
-            return "ACL_BF16";
-        case ACL_INT4:
-            return "ACL_INT4";
-        case ACL_UINT1:
-            return "ACL_UINT1";
-        case ACL_COMPLEX32:
-            return "ACL_COMPLEX32";
-        default:
-            return "UNKNOWN";
+    case ACL_DT_UNDEFINED:
+        return "ACL_DT_UNDEFINED";
+    case ACL_FLOAT:
+        return "ACL_FLOAT";
+    case ACL_FLOAT16:
+        return "ACL_FLOAT16";
+    case ACL_INT8:
+        return "ACL_INT8";
+    case ACL_INT32:
+        return "ACL_INT32";
+    case ACL_UINT8:
+        return "ACL_UINT8";
+    case ACL_INT16:
+        return "ACL_INT16";
+    case ACL_UINT16:
+        return "ACL_UINT16";
+    case ACL_UINT32:
+        return "ACL_UINT32";
+    case ACL_INT64:
+        return "ACL_INT64";
+    case ACL_UINT64:
+        return "ACL_UINT64";
+    case ACL_DOUBLE:
+        return "ACL_DOUBLE";
+    case ACL_BOOL:
+        return "ACL_BOOL";
+    case ACL_STRING:
+        return "ACL_STRING";
+    case ACL_COMPLEX64:
+        return "ACL_COMPLEX64";
+    case ACL_COMPLEX128:
+        return "ACL_COMPLEX128";
+    case ACL_BF16:
+        return "ACL_BF16";
+    case ACL_INT4:
+        return "ACL_INT4";
+    case ACL_UINT1:
+        return "ACL_UINT1";
+    case ACL_COMPLEX32:
+        return "ACL_COMPLEX32";
+    default:
+        return "UNKNOWN";
     }
 }
 
 const char *formatToString(aclFormat format) {
     switch (format) {
-        case ACL_FORMAT_UNDEFINED:
-            return "ACL_FORMAT_UNDEFINED";
-        case ACL_FORMAT_NCHW:
-            return "ACL_FORMAT_NCHW";
-        case ACL_FORMAT_NHWC:
-            return "ACL_FORMAT_NHWC";
-        case ACL_FORMAT_ND:
-            return "ACL_FORMAT_ND";
-        case ACL_FORMAT_NC1HWC0:
-            return "ACL_FORMAT_NC1HWC0";
-        case ACL_FORMAT_FRACTAL_Z:
-            return "ACL_FORMAT_FRACTAL_Z";
-        case ACL_FORMAT_NC1HWC0_C04:
-            return "ACL_FORMAT_NC1HWC0_C04";
-        case ACL_FORMAT_HWCN:
-            return "ACL_FORMAT_HWCN";
-        case ACL_FORMAT_NDHWC:
-            return "ACL_FORMAT_NDHWC";
-        case ACL_FORMAT_FRACTAL_NZ:
-            return "ACL_FORMAT_FRACTAL_NZ";
-        case ACL_FORMAT_NCDHW:
-            return "ACL_FORMAT_NCDHW";
-        case ACL_FORMAT_NDC1HWC0:
-            return "ACL_FORMAT_NDC1HWC0";
-        case ACL_FRACTAL_Z_3D:
-            return "ACL_FRACTAL_Z_3D";
-        case ACL_FORMAT_NC:
-            return "ACL_FORMAT_NC";
-        case ACL_FORMAT_NCL:
-            return "ACL_FORMAT_NCL";
-        default:
-            return "UNKNOWN";
+    case ACL_FORMAT_UNDEFINED:
+        return "ACL_FORMAT_UNDEFINED";
+    case ACL_FORMAT_NCHW:
+        return "ACL_FORMAT_NCHW";
+    case ACL_FORMAT_NHWC:
+        return "ACL_FORMAT_NHWC";
+    case ACL_FORMAT_ND:
+        return "ACL_FORMAT_ND";
+    case ACL_FORMAT_NC1HWC0:
+        return "ACL_FORMAT_NC1HWC0";
+    case ACL_FORMAT_FRACTAL_Z:
+        return "ACL_FORMAT_FRACTAL_Z";
+    case ACL_FORMAT_NC1HWC0_C04:
+        return "ACL_FORMAT_NC1HWC0_C04";
+    case ACL_FORMAT_HWCN:
+        return "ACL_FORMAT_HWCN";
+    case ACL_FORMAT_NDHWC:
+        return "ACL_FORMAT_NDHWC";
+    case ACL_FORMAT_FRACTAL_NZ:
+        return "ACL_FORMAT_FRACTAL_NZ";
+    case ACL_FORMAT_NCDHW:
+        return "ACL_FORMAT_NCDHW";
+    case ACL_FORMAT_NDC1HWC0:
+        return "ACL_FORMAT_NDC1HWC0";
+    case ACL_FRACTAL_Z_3D:
+        return "ACL_FRACTAL_Z_3D";
+    case ACL_FORMAT_NC:
+        return "ACL_FORMAT_NC";
+    case ACL_FORMAT_NCL:
+        return "ACL_FORMAT_NCL";
+    default:
+        return "UNKNOWN";
     }
 }
diff --git a/src/infiniop/devices/ascend/common_ascend.h b/src/infiniop/devices/ascend/common_ascend.h
index 96fc8a636..ef406cc25 100644
--- a/src/infiniop/devices/ascend/common_ascend.h
+++ b/src/infiniop/devices/ascend/common_ascend.h
@@ -34,7 +34,6 @@ extern "C" {
         return INFINIOP_STATUS_INTERNAL_ERROR; \
     } while (0)
 
-
 #ifdef __cplusplus
 };
 #endif
diff --git a/src/infiniop/devices/ascend/tensor_aclnn.cc b/src/infiniop/devices/ascend/tensor_aclnn.cc
index 55cc30c0c..359110f07 100644
--- a/src/infiniop/devices/ascend/tensor_aclnn.cc
+++ b/src/infiniop/devices/ascend/tensor_aclnn.cc
@@ -21,7 +21,6 @@ infiniopStatus_t aclnnTensorDescriptor::setDescriptor(aclDataType dtype, const s
     return INFINIOP_STATUS_SUCCESS;
 }
 
-
 /// @brief Infer storage shape. For now this ruturns a 1D shape of the total tensor storage size.
 /// We don't see why higher dimensional storage shape is ever needed. To change if necesary.
 infiniopStatus_t aclnnTensorDescriptor::inferStorageShape() {
@@ -93,8 +92,10 @@ char *aclnnTensorDescriptor::toString() {
 
     // Assume bufferSize
     size_t bufferSize = 1024 + this->ndim * 40 + this->storageNdim * 40;
-    char *buffer = (char *) malloc(bufferSize);
-    if (!buffer) return NULL;
+    char *buffer = (char *)malloc(bufferSize);
+    if (!buffer) {
+        return NULL;
+    }
 
     // Write info into buffer
     char *ptr = buffer;
diff --git a/src/infiniop/devices/cpu/common_cpu.cc b/src/infiniop/devices/cpu/common_cpu.cc
index ea62adb1e..7178584e9 100644
--- a/src/infiniop/devices/cpu/common_cpu.cc
+++ b/src/infiniop/devices/cpu/common_cpu.cc
@@ -35,11 +35,10 @@ float f16_to_f32(uint16_t h) {
 
 uint16_t f32_to_f16(float val) {
     uint32_t f32;
-    memcpy(&f32, &val, sizeof(f32));      // Read the bits of the float32
-    uint16_t sign = (f32 >> 16) & 0x8000; // Extract the sign bit
-    int32_t exponent =
-        ((f32 >> 23) & 0xFF) - 127;     // Extract and de-bias the exponent
-    uint32_t mantissa = f32 & 0x7FFFFF; // Extract the mantissa (fraction part)
+    memcpy(&f32, &val, sizeof(f32));               // Read the bits of the float32
+    uint16_t sign = (f32 >> 16) & 0x8000;          // Extract the sign bit
+    int32_t exponent = ((f32 >> 23) & 0xFF) - 127; // Extract and de-bias the exponent
+    uint32_t mantissa = f32 & 0x7FFFFF;            // Extract the mantissa (fraction part)
 
     if (exponent >= 31) { // Special cases for Inf and NaN
         // NaN
diff --git a/src/infiniop/devices/cpu/common_cpu.h b/src/infiniop/devices/cpu/common_cpu.h
index 311bfce9e..01105ee30 100644
--- a/src/infiniop/devices/cpu/common_cpu.h
+++ b/src/infiniop/devices/cpu/common_cpu.h
@@ -19,7 +19,7 @@ size_t indexToReducedOffset(size_t flat_index, size_t ndim, int64_t const *broad
 size_t indexToOffset(size_t flat_index, size_t ndim, size_t const *shape, int64_t const *strides);
 
 /**
- * get the total array size (element count) after applying padding for a 
+ * get the total array size (element count) after applying padding for a
  * ndim-ary tensor with the given shape
  */
 size_t getPaddedSize(size_t ndim, size_t *shape, size_t const *pads);
@@ -27,4 +27,4 @@ size_t getPaddedSize(size_t ndim, size_t *shape, size_t const *pads);
 // calculate the padded shape and store the result in padded_shape
 std::vector<size_t> getPaddedShape(size_t ndim, size_t const *shape, size_t const *pads);
 
-#endif// __INFINIOP__COMMON_CPU_H__
+#endif // __INFINIOP__COMMON_CPU_H__
diff --git a/src/infiniop/devices/cuda/common_cuda.cuh b/src/infiniop/devices/cuda/common_cuda.cuh
index c1cea98e1..5f61d18a6 100644
--- a/src/infiniop/devices/cuda/common_cuda.cuh
+++ b/src/infiniop/devices/cuda/common_cuda.cuh
@@ -47,18 +47,18 @@ struct InfiniopCudaHandle {
     int compute_capability_minor;
 };
 
-template<typename T>
+template <typename T>
 void use_cublas(std::shared_ptr<Pool<cublasHandle_t>> cublas_handle_pool, int device_id, cudaStream_t stream, T const &f) {
     auto handle = cublas_handle_pool->pop();
     if (!handle) {
         cublasCreate(&(*handle));
     }
-    cublasSetStream(*handle, (cudaStream_t) stream);
+    cublasSetStream(*handle, (cudaStream_t)stream);
     f(*handle);
     cublas_handle_pool->push(std::move(*handle));
 }
 
-template<typename T>
+template <typename T>
 cudnnStatus_t use_cudnn(std::shared_ptr<Pool<cudnnHandle_t>> cudnn_handle_pool, int device_id, cudaStream_t stream, T const &f) {
     auto handle = cudnn_handle_pool->pop();
     if (!handle) {
@@ -72,24 +72,24 @@ cudnnStatus_t use_cudnn(std::shared_ptr<Pool<cudnnHandle_t>> cudnn_handle_pool,
 
 inline cudnnDataType_t getCudnnDtype(infiniDtype_t dt) {
     switch (dt) {
-        case INFINI_DTYPE_F16:
-            return CUDNN_DATA_HALF;
-        case INFINI_DTYPE_F32:
-            return CUDNN_DATA_FLOAT;
-        case INFINI_DTYPE_F64:
-            return CUDNN_DATA_DOUBLE;
-        case INFINI_DTYPE_BF16:
-            return CUDNN_DATA_BFLOAT16;
-        case INFINI_DTYPE_I8:
-            return CUDNN_DATA_INT8;
-        case INFINI_DTYPE_I32:
-            return CUDNN_DATA_INT32;
-        case INFINI_DTYPE_I64:
-            return CUDNN_DATA_INT64;
-        case INFINI_DTYPE_U8:
-            return CUDNN_DATA_UINT8;
-        default:
-            return CUDNN_DATA_FLOAT;
+    case INFINI_DTYPE_F16:
+        return CUDNN_DATA_HALF;
+    case INFINI_DTYPE_F32:
+        return CUDNN_DATA_FLOAT;
+    case INFINI_DTYPE_F64:
+        return CUDNN_DATA_DOUBLE;
+    case INFINI_DTYPE_BF16:
+        return CUDNN_DATA_BFLOAT16;
+    case INFINI_DTYPE_I8:
+        return CUDNN_DATA_INT8;
+    case INFINI_DTYPE_I32:
+        return CUDNN_DATA_INT32;
+    case INFINI_DTYPE_I64:
+        return CUDNN_DATA_INT64;
+    case INFINI_DTYPE_U8:
+        return CUDNN_DATA_UINT8;
+    default:
+        return CUDNN_DATA_FLOAT;
     }
 }
 
@@ -118,4 +118,4 @@ inline __device__ __host__ size_t indexToOffset(size_t flat_index, size_t ndim,
     return res;
 }
 
-#endif// __INFINIOP_COMMON_CUDA_H__
+#endif // __INFINIOP_COMMON_CUDA_H__
diff --git a/src/infiniop/devices/pool.h b/src/infiniop/devices/pool.h
index 015e1761b..951de7134 100644
--- a/src/infiniop/devices/pool.h
+++ b/src/infiniop/devices/pool.h
@@ -5,7 +5,7 @@
 #include <mutex>
 #include <optional>
 
-template<class T>
+template <class T>
 class Pool {
 public:
     Pool() : _head(nullptr) {}
@@ -21,7 +21,7 @@ class Pool {
     void push(T &&val) const {
         Node<T> *new_node = new Node<T>(std::move(val));
         new_node->next = _head.load();
-        while (!_head.compare_exchange_weak(new_node->next, new_node));
+        while (!_head.compare_exchange_weak(new_node->next, new_node)) {}
     }
 
     std::optional<T> pop() const {
@@ -37,7 +37,7 @@ class Pool {
     }
 
 private:
-    template<class U>
+    template <class U>
     struct Node {
         U data;
         Node<U> *next;
diff --git a/src/infiniop/ops/causal_softmax/operator.cc b/src/infiniop/ops/causal_softmax/operator.cc
index bb547683f..b906f68c2 100644
--- a/src/infiniop/ops/causal_softmax/operator.cc
+++ b/src/infiniop/ops/causal_softmax/operator.cc
@@ -6,35 +6,35 @@ __C infiniopStatus_t infiniopCreateCausalSoftmaxDescriptor(
     infiniopTensorDescriptor_t y_desc) {
     switch (handle->device) {
 #ifdef ENABLE_CPU
-        case DevCpu:
-            return cpuCreateCausalSoftmaxDescriptor(handle, (CausalSoftmaxCpuDescriptor_t *) desc_ptr, y_desc);
+    case DevCpu:
+        return cpuCreateCausalSoftmaxDescriptor(handle, (CausalSoftmaxCpuDescriptor_t *)desc_ptr, y_desc);
 #endif
 #ifdef ENABLE_NV_GPU
-        case DevNvGpu: {
-            return cudaCreateCausalSoftmaxDescriptor((CudaHandle_t)handle, (CausalSoftmaxCudaDescriptor_t *) desc_ptr, y_desc);
-        }
+    case DevNvGpu: {
+        return cudaCreateCausalSoftmaxDescriptor((CudaHandle_t)handle, (CausalSoftmaxCudaDescriptor_t *)desc_ptr, y_desc);
+    }
 
 #endif
 #ifdef ENABLE_CAMBRICON_MLU
-        case DevCambriconMlu: {
-            return bangCreateCausalSoftmaxDescriptor((BangHandle_t) handle, (CausalSoftmaxBangDescriptor_t *) desc_ptr, y_desc);
-            // return cnnlCreateCausalSoftmaxDescriptor((BangHandle_t) handle, (CausalSoftmaxCnnlDescriptor_t *) desc_ptr, y_desc);
-        }
+    case DevCambriconMlu: {
+        return bangCreateCausalSoftmaxDescriptor((BangHandle_t)handle, (CausalSoftmaxBangDescriptor_t *)desc_ptr, y_desc);
+        // return cnnlCreateCausalSoftmaxDescriptor((BangHandle_t) handle, (CausalSoftmaxCnnlDescriptor_t *) desc_ptr, y_desc);
+    }
 #endif
 #ifdef ENABLE_ASCEND_NPU
-        case DevAscendNpu: {
-            return aclnnCreateCausalSoftmaxDescriptor((AscendHandle_t) handle, (CausalSoftmaxAclnnDescriptor_t *) desc_ptr, y_desc);
-        }
+    case DevAscendNpu: {
+        return aclnnCreateCausalSoftmaxDescriptor((AscendHandle_t)handle, (CausalSoftmaxAclnnDescriptor_t *)desc_ptr, y_desc);
+    }
 #endif
 #ifdef ENABLE_METAX_GPU
-        case DevMetaxGpu: {
-            return macaCreateCausalSoftmaxDescriptor((MacaHandle_t) handle, (CausalSoftmaxMacaDescriptor_t *) desc_ptr, y_desc);
-        }
+    case DevMetaxGpu: {
+        return macaCreateCausalSoftmaxDescriptor((MacaHandle_t)handle, (CausalSoftmaxMacaDescriptor_t *)desc_ptr, y_desc);
+    }
 #endif
 #ifdef ENABLE_MTHREADS_GPU
-        case DevMthreadsGpu: {
-            return musaCreateCausalSoftmaxDescriptor((MusaHandle_t) handle, (CausalSoftmaxMusaDescriptor_t *) desc_ptr, y_desc);
-        }
+    case DevMthreadsGpu: {
+        return musaCreateCausalSoftmaxDescriptor((MusaHandle_t)handle, (CausalSoftmaxMusaDescriptor_t *)desc_ptr, y_desc);
+    }
 #endif
     }
     return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
@@ -43,36 +43,36 @@ __C infiniopStatus_t infiniopCreateCausalSoftmaxDescriptor(
 __C infiniopStatus_t infiniopGetCausalSoftmaxWorkspaceSize(infiniopCausalSoftmaxDescriptor_t desc, uint64_t *size) {
     switch (desc->device) {
 #ifdef ENABLE_CPU
-        case DevCpu:
-            return cpuGetCausalSoftmaxWorkspaceSize((CausalSoftmaxCpuDescriptor_t) desc, size);
+    case DevCpu:
+        return cpuGetCausalSoftmaxWorkspaceSize((CausalSoftmaxCpuDescriptor_t)desc, size);
 #endif
 #ifdef ENABLE_NV_GPU
-        case DevNvGpu: {
-            return cudaGetCausalSoftmaxWorkspaceSize((CausalSoftmaxCudaDescriptor_t) desc, size);
-        }
+    case DevNvGpu: {
+        return cudaGetCausalSoftmaxWorkspaceSize((CausalSoftmaxCudaDescriptor_t)desc, size);
+    }
 
 #endif
 #ifdef ENABLE_CAMBRICON_MLU
-        case DevCambriconMlu: {
-            return bangGetCausalSoftmaxWorkspaceSize((CausalSoftmaxBangDescriptor_t) desc, size);
-            // return cnnlGetCausalSoftmaxWorkspaceSize((CausalSoftmaxCnnlDescriptor_t) desc, size);
-        }
+    case DevCambriconMlu: {
+        return bangGetCausalSoftmaxWorkspaceSize((CausalSoftmaxBangDescriptor_t)desc, size);
+        // return cnnlGetCausalSoftmaxWorkspaceSize((CausalSoftmaxCnnlDescriptor_t) desc, size);
+    }
 
 #endif
 #ifdef ENABLE_ASCEND_NPU
-        case DevAscendNpu: {
-            return aclnnGetCausalSoftmaxWorkspaceSize((CausalSoftmaxAclnnDescriptor_t) desc, size);
-        }
+    case DevAscendNpu: {
+        return aclnnGetCausalSoftmaxWorkspaceSize((CausalSoftmaxAclnnDescriptor_t)desc, size);
+    }
 #endif
 #ifdef ENABLE_METAX_GPU
-        case DevMetaxGpu: {
-            return macaGetCausalSoftmaxWorkspaceSize((CausalSoftmaxMacaDescriptor_t) desc, size);
-        }
+    case DevMetaxGpu: {
+        return macaGetCausalSoftmaxWorkspaceSize((CausalSoftmaxMacaDescriptor_t)desc, size);
+    }
 #endif
 #ifdef ENABLE_MTHREADS_GPU
-        case DevMthreadsGpu: {
-            return musaGetCausalSoftmaxWorkspaceSize((CausalSoftmaxMusaDescriptor_t) desc, size);
-        }
+    case DevMthreadsGpu: {
+        return musaGetCausalSoftmaxWorkspaceSize((CausalSoftmaxMusaDescriptor_t)desc, size);
+    }
 #endif
     }
     return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
@@ -81,35 +81,35 @@ __C infiniopStatus_t infiniopGetCausalSoftmaxWorkspaceSize(infiniopCausalSoftmax
 __C infiniopStatus_t infiniopCausalSoftmax(infiniopCausalSoftmaxDescriptor_t desc, void *workspace, uint64_t workspace_size, void *data, void *stream) {
     switch (desc->device) {
 #ifdef ENABLE_CPU
-        case DevCpu:
-            return cpuCausalSoftmax((CausalSoftmaxCpuDescriptor_t) desc, workspace, workspace_size, data, stream);
+    case DevCpu:
+        return cpuCausalSoftmax((CausalSoftmaxCpuDescriptor_t)desc, workspace, workspace_size, data, stream);
 #endif
 #ifdef ENABLE_NV_GPU
-        case DevNvGpu: {
-            return cudaCausalSoftmax((CausalSoftmaxCudaDescriptor_t) desc, workspace, workspace_size, data, stream);
-        }
+    case DevNvGpu: {
+        return cudaCausalSoftmax((CausalSoftmaxCudaDescriptor_t)desc, workspace, workspace_size, data, stream);
+    }
 
 #endif
 #ifdef ENABLE_CAMBRICON_MLU
-        case DevCambriconMlu: {
-            return bangCausalSoftmax((CausalSoftmaxBangDescriptor_t) desc, workspace, workspace_size, data, stream);
-            // return cnnlCausalSoftmax((CausalSoftmaxCnnlDescriptor_t) desc, workspace, workspace_size, data, stream);
-        }
+    case DevCambriconMlu: {
+        return bangCausalSoftmax((CausalSoftmaxBangDescriptor_t)desc, workspace, workspace_size, data, stream);
+        // return cnnlCausalSoftmax((CausalSoftmaxCnnlDescriptor_t) desc, workspace, workspace_size, data, stream);
+    }
 #endif
 #ifdef ENABLE_ASCEND_NPU
-        case DevAscendNpu: {
-            return aclnnCausalSoftmax((CausalSoftmaxAclnnDescriptor_t) desc, workspace, workspace_size, data, stream);
-        }
+    case DevAscendNpu: {
+        return aclnnCausalSoftmax((CausalSoftmaxAclnnDescriptor_t)desc, workspace, workspace_size, data, stream);
+    }
 #endif
 #ifdef ENABLE_METAX_GPU
-        case DevMetaxGpu: {
-            return macaCausalSoftmax((CausalSoftmaxMacaDescriptor_t) desc, workspace, workspace_size, data, stream);
-        }
+    case DevMetaxGpu: {
+        return macaCausalSoftmax((CausalSoftmaxMacaDescriptor_t)desc, workspace, workspace_size, data, stream);
+    }
 #endif
 #ifdef ENABLE_MTHREADS_GPU
-        case DevMthreadsGpu: {
-            return musaCausalSoftmax((CausalSoftmaxMusaDescriptor_t) desc, workspace, workspace_size, data, stream);
-        }
+    case DevMthreadsGpu: {
+        return musaCausalSoftmax((CausalSoftmaxMusaDescriptor_t)desc, workspace, workspace_size, data, stream);
+    }
 #endif
     }
     return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
@@ -118,34 +118,34 @@ __C infiniopStatus_t infiniopCausalSoftmax(infiniopCausalSoftmaxDescriptor_t des
 __C infiniopStatus_t infiniopDestroyCausalSoftmaxDescriptor(infiniopCausalSoftmaxDescriptor_t desc) {
     switch (desc->device) {
 #ifdef ENABLE_CPU
-        case DevCpu:
-            return cpuDestroyCausalSoftmaxDescriptor((CausalSoftmaxCpuDescriptor_t) desc);
+    case DevCpu:
+        return cpuDestroyCausalSoftmaxDescriptor((CausalSoftmaxCpuDescriptor_t)desc);
 #endif
 #ifdef ENABLE_NV_GPU
-        case DevNvGpu: {
-            return cudaDestroyCausalSoftmaxDescriptor((CausalSoftmaxCudaDescriptor_t) desc);
-        }
+    case DevNvGpu: {
+        return cudaDestroyCausalSoftmaxDescriptor((CausalSoftmaxCudaDescriptor_t)desc);
+    }
 
 #endif
 #ifdef ENABLE_CAMBRICON_MLU
-        case DevCambriconMlu: {
-            return bangDestroyCausalSoftmaxDescriptor((CausalSoftmaxBangDescriptor_t) desc);
-            // return cnnlDestroyCausalSoftmaxDescriptor((CausalSoftmaxCnnlDescriptor_t) desc);
-        }
+    case DevCambriconMlu: {
+        return bangDestroyCausalSoftmaxDescriptor((CausalSoftmaxBangDescriptor_t)desc);
+        // return cnnlDestroyCausalSoftmaxDescriptor((CausalSoftmaxCnnlDescriptor_t) desc);
+    }
 #endif
 #ifdef ENABLE_ASCEND_NPU
-        case DevAscendNpu: {
-            return aclnnDestroyCausalSoftmaxDescriptor((CausalSoftmaxAclnnDescriptor_t) desc);
-        }
+    case DevAscendNpu: {
+        return aclnnDestroyCausalSoftmaxDescriptor((CausalSoftmaxAclnnDescriptor_t)desc);
+    }
 #endif
 #ifdef ENABLE_METAX_GPU
-        case DevMetaxGpu: {
-            return macaDestroyCausalSoftmaxDescriptor((CausalSoftmaxMacaDescriptor_t) desc);
-        }
+    case DevMetaxGpu: {
+        return macaDestroyCausalSoftmaxDescriptor((CausalSoftmaxMacaDescriptor_t)desc);
+    }
 #endif
 #ifdef ENABLE_MTHREADS_GPU
-        case DevMthreadsGpu:
-            return musaDestroyCausalSoftmaxDescriptor((CausalSoftmaxMusaDescriptor_t) desc);
+    case DevMthreadsGpu:
+        return musaDestroyCausalSoftmaxDescriptor((CausalSoftmaxMusaDescriptor_t)desc);
 #endif
     }
     return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
diff --git a/src/infiniop/ops/matmul/ascend/matmul_aclnn.cc b/src/infiniop/ops/matmul/ascend/matmul_aclnn.cc
index 642eee7bd..aa25db3ec 100644
--- a/src/infiniop/ops/matmul/ascend/matmul_aclnn.cc
+++ b/src/infiniop/ops/matmul/ascend/matmul_aclnn.cc
@@ -123,17 +123,13 @@ infiniopStatus_t aclnnMatmul(MatmulAclnnDescriptor_t desc, void *workspace,
 
     for (size_t i = 0; i < batch; i++) {
         AclSetTensorAddr(desc->executor, 0, ta,
-                         (char *)(a) + i * desc->info->a_matrix.stride *
-                                           infiniSizeof(desc->dtype));
+                         (char *)(a) + i * desc->info->a_matrix.stride * infiniSizeof(desc->dtype));
         AclSetTensorAddr(desc->executor, 1, tb,
-                         (char *)(b) + i * desc->info->b_matrix.stride *
-                                           infiniSizeof(desc->dtype));
+                         (char *)(b) + i * desc->info->b_matrix.stride * infiniSizeof(desc->dtype));
         AclSetTensorAddr(desc->executor, 2, tc,
-                         (char *)(c) + i * desc->info->c_matrix.stride *
-                                           infiniSizeof(desc->dtype));
+                         (char *)(c) + i * desc->info->c_matrix.stride * infiniSizeof(desc->dtype));
         AclSetTensorAddr(desc->executor, 3, tc,
-                         (char *)(c) + i * desc->info->c_matrix.stride *
-                                           infiniSizeof(desc->dtype));
+                         (char *)(c) + i * desc->info->c_matrix.stride * infiniSizeof(desc->dtype));
         ret = aclnnGemm(workspace, workspaceSize, desc->executor, stream);
         CHECK_RET(ret == ACL_SUCCESS,
                   LOG_PRINT("aclnnGemm failed. ERROR: %d\n", ret);
diff --git a/src/infiniop/ops/matmul/bang/matmul_cnnl.cc b/src/infiniop/ops/matmul/bang/matmul_cnnl.cc
index 10388551f..444bfdca1 100644
--- a/src/infiniop/ops/matmul/bang/matmul_cnnl.cc
+++ b/src/infiniop/ops/matmul/bang/matmul_cnnl.cc
@@ -73,8 +73,8 @@ bangDestroyMatmulDescriptor(infiniopMatmulBangDescriptor_t desc) {
 }
 
 void bangMatmulCnnl(infiniopMatmulBangDescriptor_t desc, void *workspace, void *c,
-                 float beta, void const *a, void const *b, float alpha,
-                 void *stream) {
+                    float beta, void const *a, void const *b, float alpha,
+                    void *stream) {
     auto info = desc->info;
     if (info.is_transed) {
         std::swap(a, b);
diff --git a/src/infiniop/ops/matmul/blas.h b/src/infiniop/ops/matmul/blas.h
index 2f671e746..51f027bcf 100644
--- a/src/infiniop/ops/matmul/blas.h
+++ b/src/infiniop/ops/matmul/blas.h
@@ -88,7 +88,7 @@ struct MatmulInfo {
             return;
         }
 
-        if (c_matrix.rows != a_matrix.rows || c_matrix.cols != b_matrix.cols || a_matrix.cols != b_matrix.rows){
+        if (c_matrix.rows != a_matrix.rows || c_matrix.cols != b_matrix.cols || a_matrix.cols != b_matrix.rows) {
             *status = INFINIOP_STATUS_BAD_TENSOR_SHAPE;
             return;
         }
@@ -113,4 +113,4 @@ struct MatmulInfo {
     }
 };
 
-#endif// __BLAS_H__
+#endif // __BLAS_H__
diff --git a/src/infiniop/ops/matmul/cpu/matmul_cpu.cc b/src/infiniop/ops/matmul/cpu/matmul_cpu.cc
index 000d41370..7e0313304 100644
--- a/src/infiniop/ops/matmul/cpu/matmul_cpu.cc
+++ b/src/infiniop/ops/matmul/cpu/matmul_cpu.cc
@@ -38,8 +38,8 @@ cpuDestroyMatmulDescriptor(infiniopMatmulCpuDescriptor_t desc) {
 
 template <typename Tdata>
 infiniopStatus_t cpuCalculateMatmul(infiniopMatmulCpuDescriptor_t desc, void *c,
-                            float beta, void const *a, void const *b,
-                            float alpha) {
+                                    float beta, void const *a, void const *b,
+                                    float alpha) {
     auto info = desc->info;
 
     if (info.is_transed) {
@@ -49,20 +49,11 @@ infiniopStatus_t cpuCalculateMatmul(infiniopMatmulCpuDescriptor_t desc, void *c,
     for (size_t i = 0; i < info.batch; ++i) {
         for (size_t m_ = 0; m_ < info.m; ++m_) {
             for (size_t n_ = 0; n_ < info.n; ++n_) {
-                auto c_ = reinterpret_cast<Tdata *>(c) +
-                          i * info.c_matrix.stride +
-                          m_ * info.c_matrix.row_stride +
-                          n_ * info.c_matrix.col_stride;
+                auto c_ = reinterpret_cast<Tdata *>(c) + i * info.c_matrix.stride + m_ * info.c_matrix.row_stride + n_ * info.c_matrix.col_stride;
                 float sum = 0;
                 for (size_t k_ = 0; k_ < info.k; ++k_) {
-                    auto a_ = reinterpret_cast<Tdata const *>(a) +
-                              i * info.a_matrix.stride +
-                              m_ * info.a_matrix.row_stride +
-                              k_ * info.a_matrix.col_stride;
-                    auto b_ = reinterpret_cast<Tdata const *>(b) +
-                              i * info.b_matrix.stride +
-                              n_ * info.b_matrix.col_stride +
-                              k_ * info.b_matrix.row_stride;
+                    auto a_ = reinterpret_cast<Tdata const *>(a) + i * info.a_matrix.stride + m_ * info.a_matrix.row_stride + k_ * info.a_matrix.col_stride;
+                    auto b_ = reinterpret_cast<Tdata const *>(b) + i * info.b_matrix.stride + n_ * info.b_matrix.col_stride + k_ * info.b_matrix.row_stride;
                     if constexpr (std::is_same<Tdata, uint16_t>::value) {
                         sum += f16_to_f32(*a_) * f16_to_f32(*b_);
                     } else {
diff --git a/src/infiniop/ops/matmul/cuda/matmul_cuda.cu b/src/infiniop/ops/matmul/cuda/matmul_cuda.cu
index 6c7da5a61..e3af48824 100644
--- a/src/infiniop/ops/matmul/cuda/matmul_cuda.cu
+++ b/src/infiniop/ops/matmul/cuda/matmul_cuda.cu
@@ -1,5 +1,5 @@
-#include "./matmul_cuda.cuh"
 #include "../../utils.h"
+#include "./matmul_cuda.cuh"
 
 infiniopStatus_t cudaCreateMatmulDescriptor(infiniopCudaHandle_t handle,
                                             infiniopMatmulCudaDescriptor_t *desc_ptr,
diff --git a/src/infiniop/ops/matmul/cuda/matmul_cuda.cuh b/src/infiniop/ops/matmul/cuda/matmul_cuda.cuh
index 6b7d1d4f7..04bb2756f 100644
--- a/src/infiniop/ops/matmul/cuda/matmul_cuda.cuh
+++ b/src/infiniop/ops/matmul/cuda/matmul_cuda.cuh
@@ -1,10 +1,10 @@
 #ifndef __INFINIOP_MATMUL_CUDA_H__
 #define __INFINIOP_MATMUL_CUDA_H__
 
-#include "matmul_cuda_api.h"
 #include "../../../devices/cuda/common_cuda.cuh"
-#include <memory>
 #include "../blas.h"
+#include "matmul_cuda_api.h"
+#include <memory>
 
 typedef struct InfiniopMatmulCudaDescriptor {
     infiniDevice_t device;
@@ -14,4 +14,4 @@ typedef struct InfiniopMatmulCudaDescriptor {
     std::shared_ptr<Pool<cublasHandle_t>> cublas_handle_pool;
 } InfiniopMatmulCudaDescriptor;
 
-#endif// __INFINIOP_MATMUL_CUDA_H__
+#endif // __INFINIOP_MATMUL_CUDA_H__
diff --git a/src/infiniop/ops/matmul/cuda/matmul_cuda_api.h b/src/infiniop/ops/matmul/cuda/matmul_cuda_api.h
index 1207d8bbe..c484a689a 100644
--- a/src/infiniop/ops/matmul/cuda/matmul_cuda_api.h
+++ b/src/infiniop/ops/matmul/cuda/matmul_cuda_api.h
@@ -4,7 +4,6 @@
 #include "../../../devices/cuda/cuda_handle.h"
 #include "infiniop/operator.h"
 
-
 struct InfiniopMatmulCudaDescriptor;
 typedef struct InfiniopMatmulCudaDescriptor *infiniopMatmulCudaDescriptor_t;
 
@@ -28,5 +27,4 @@ infiniopStatus_t cudaMatmul(infiniopMatmulCudaDescriptor_t desc,
 
 infiniopStatus_t cudaDestroyMatmulDescriptor(infiniopMatmulCudaDescriptor_t desc);
 
-
 #endif // __INFINIOP_MATMUL_CUDA_API_H__
diff --git a/src/infiniop/ops/matmul/cuda/matmul_cuda_kernel.cu b/src/infiniop/ops/matmul/cuda/matmul_cuda_kernel.cu
index 30a69d4c5..8713f519e 100644
--- a/src/infiniop/ops/matmul/cuda/matmul_cuda_kernel.cu
+++ b/src/infiniop/ops/matmul/cuda/matmul_cuda_kernel.cu
@@ -1,7 +1,7 @@
 #include "../../utils.h"
 #include "./matmul_cuda.cuh"
 
-template<typename Tdata>
+template <typename Tdata>
 infiniopStatus_t cudaMatmulCublas(infiniopMatmulCudaDescriptor_t desc, void *c, float beta, void const *a, void const *b, float alpha, void *stream) {
     auto info = desc->info;
 
@@ -26,7 +26,7 @@ infiniopStatus_t cudaMatmulCublas(infiniopMatmulCudaDescriptor_t desc, void *c,
     auto op_a = info.a_matrix.row_stride == 1 ? CUBLAS_OP_N : CUBLAS_OP_T;
     auto op_b = info.b_matrix.row_stride == 1 ? CUBLAS_OP_N : CUBLAS_OP_T;
 
-    use_cublas(desc->cublas_handle_pool, desc->device_id, (cudaStream_t) stream,
+    use_cublas(desc->cublas_handle_pool, desc->device_id, (cudaStream_t)stream,
                [&](cublasHandle_t handle) { cublasGemmStridedBatchedEx(
                                                 handle,
                                                 op_a,
diff --git a/src/infiniop/ops/random_sample/operator.cc b/src/infiniop/ops/random_sample/operator.cc
index b308274a2..f879e3aab 100644
--- a/src/infiniop/ops/random_sample/operator.cc
+++ b/src/infiniop/ops/random_sample/operator.cc
@@ -3,36 +3,36 @@
 __C infiniopStatus_t infiniopCreateRandomSampleDescriptor(infiniopHandle_t handle, infiniopRandomSampleDescriptor_t *desc_ptr, infiniopTensorDescriptor_t result, infiniopTensorDescriptor_t probs) {
     switch (handle->device) {
 #ifdef ENABLE_CPU
-        case DevCpu:
-            return cpuCreateRandomSampleDescriptor(handle, (RandomSampleCpuDescriptor_t *) desc_ptr, result, probs);
+    case DevCpu:
+        return cpuCreateRandomSampleDescriptor(handle, (RandomSampleCpuDescriptor_t *)desc_ptr, result, probs);
 #endif
 #ifdef ENABLE_NV_GPU
-        case DevNvGpu:
-            return cudaCreateRandomSampleDescriptor((CudaHandle_t) handle, (RandomSampleCudaDescriptor_t *) desc_ptr, result, probs);
+    case DevNvGpu:
+        return cudaCreateRandomSampleDescriptor((CudaHandle_t)handle, (RandomSampleCudaDescriptor_t *)desc_ptr, result, probs);
 #endif
 #ifdef ENABLE_CAMBRICON_MLU
-        case DevCambriconMlu: {
-            return bangCreateRandomSampleDescriptor((BangHandle_t) handle,
-                                                    (RandomSampleBangDescriptor_t *) desc_ptr, result,
-                                                    probs);
-        }
+    case DevCambriconMlu: {
+        return bangCreateRandomSampleDescriptor((BangHandle_t)handle,
+                                                (RandomSampleBangDescriptor_t *)desc_ptr, result,
+                                                probs);
+    }
 #endif
 #ifdef ENABLE_ASCEND_NPU
-        case DevAscendNpu: {
-            return ascendCreateRandomSampleDescriptor((AscendHandle_t) handle,
-                                                     (RandomSampleAscendDescriptor_t *) desc_ptr, result, probs);
-        }
+    case DevAscendNpu: {
+        return ascendCreateRandomSampleDescriptor((AscendHandle_t)handle,
+                                                  (RandomSampleAscendDescriptor_t *)desc_ptr, result, probs);
+    }
 #endif
 #ifdef ENABLE_METAX_GPU
-        case DevMetaxGpu: {
-            return macaCreateRandomSampleDescriptor((MacaHandle_t) handle,
-                                                    (RandomSampleMacaDescriptor_t *) desc_ptr, result,
-                                                    probs);
-        }
+    case DevMetaxGpu: {
+        return macaCreateRandomSampleDescriptor((MacaHandle_t)handle,
+                                                (RandomSampleMacaDescriptor_t *)desc_ptr, result,
+                                                probs);
+    }
 #endif
 #ifdef ENABLE_MTHREADS_GPU
-        case DevMthreadsGpu:
-            return musaCreateRandomSampleDescriptor((MusaHandle_t) handle, (RandomSampleMusaDescriptor_t *) desc_ptr, result, probs);
+    case DevMthreadsGpu:
+        return musaCreateRandomSampleDescriptor((MusaHandle_t)handle, (RandomSampleMusaDescriptor_t *)desc_ptr, result, probs);
 #endif
     }
     return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
@@ -41,35 +41,35 @@ __C infiniopStatus_t infiniopCreateRandomSampleDescriptor(infiniopHandle_t handl
 __C infiniopStatus_t infiniopGetRandomSampleWorkspaceSize(infiniopRandomSampleDescriptor_t desc, uint64_t *size) {
     switch (desc->device) {
 #ifdef ENABLE_CPU
-        case DevCpu:
-            return cpuGetRandomSampleWorkspaceSize((RandomSampleCpuDescriptor_t) desc, size);
+    case DevCpu:
+        return cpuGetRandomSampleWorkspaceSize((RandomSampleCpuDescriptor_t)desc, size);
 #endif
 #ifdef ENABLE_NV_GPU
-        case DevNvGpu: {
-            return cudaGetRandomSampleWorkspaceSize((RandomSampleCudaDescriptor_t) desc, size);
-        }
+    case DevNvGpu: {
+        return cudaGetRandomSampleWorkspaceSize((RandomSampleCudaDescriptor_t)desc, size);
+    }
 
 #endif
 #ifdef ENABLE_CAMBRICON_MLU
-        case DevCambriconMlu: {
-            return bangGetRandomSampleWorkspaceSize((RandomSampleBangDescriptor_t) desc, size);
-            // return cnnlGetRandomSampleWorkspaceSize((RandomSampleCnnlDescriptor_t) desc, size);
-        }
+    case DevCambriconMlu: {
+        return bangGetRandomSampleWorkspaceSize((RandomSampleBangDescriptor_t)desc, size);
+        // return cnnlGetRandomSampleWorkspaceSize((RandomSampleCnnlDescriptor_t) desc, size);
+    }
 #endif
 #ifdef ENABLE_ASCEND_NPU
-        case DevAscendNpu: {
-            return ascendGetRandomSampleWorkspaceSize((RandomSampleAscendDescriptor_t) desc, size);
-        }
+    case DevAscendNpu: {
+        return ascendGetRandomSampleWorkspaceSize((RandomSampleAscendDescriptor_t)desc, size);
+    }
 #endif
 #ifdef ENABLE_METAX_GPU
-        case DevMetaxGpu: {
-            return macaGetRandomSampleWorkspaceSize((RandomSampleMacaDescriptor_t) desc, size);
-        }
+    case DevMetaxGpu: {
+        return macaGetRandomSampleWorkspaceSize((RandomSampleMacaDescriptor_t)desc, size);
+    }
 #endif
 #ifdef ENABLE_MTHREADS_GPU
-        case DevMthreadsGpu: {
-            return musaGetRandomSampleWorkspaceSize((RandomSampleMusaDescriptor_t) desc, size);
-        }
+    case DevMthreadsGpu: {
+        return musaGetRandomSampleWorkspaceSize((RandomSampleMusaDescriptor_t)desc, size);
+    }
 #endif
     }
     return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
@@ -87,31 +87,31 @@ __C infiniopStatus_t infiniopRandomSample(infiniopRandomSampleDescriptor_t desc,
                                           void *stream) {
     switch (desc->device) {
 #ifdef ENABLE_CPU
-        case DevCpu:
-            return cpuRandomSample((RandomSampleCpuDescriptor_t) desc, workspace, workspace_size, result, probs, random_val, topp, topk, temperature, stream);
+    case DevCpu:
+        return cpuRandomSample((RandomSampleCpuDescriptor_t)desc, workspace, workspace_size, result, probs, random_val, topp, topk, temperature, stream);
 #endif
 #ifdef ENABLE_NV_GPU
-        case DevNvGpu:
-            return cudaRandomSample((RandomSampleCudaDescriptor_t) desc, workspace, workspace_size, result, probs, random_val, topp, topk, temperature, stream);
+    case DevNvGpu:
+        return cudaRandomSample((RandomSampleCudaDescriptor_t)desc, workspace, workspace_size, result, probs, random_val, topp, topk, temperature, stream);
 #endif
 #ifdef ENABLE_CAMBRICON_MLU
-        case DevCambriconMlu: {
-            return bangRandomSample((RandomSampleBangDescriptor_t) desc, workspace, workspace_size, result, probs, random_val, topp, topk, temperature, stream);
-        }
+    case DevCambriconMlu: {
+        return bangRandomSample((RandomSampleBangDescriptor_t)desc, workspace, workspace_size, result, probs, random_val, topp, topk, temperature, stream);
+    }
 #endif
 #ifdef ENABLE_ASCEND_NPU
-        case DevAscendNpu: {
-            return ascendRandomSample((RandomSampleAscendDescriptor_t) desc, workspace, workspace_size, result, probs, random_val, topp, topk, temperature, stream);
-        }
+    case DevAscendNpu: {
+        return ascendRandomSample((RandomSampleAscendDescriptor_t)desc, workspace, workspace_size, result, probs, random_val, topp, topk, temperature, stream);
+    }
 #endif
 #ifdef ENABLE_METAX_GPU
-        case DevMetaxGpu: {
-            return macaRandomSample((RandomSampleMacaDescriptor_t) desc, workspace, workspace_size, result, probs, random_val, topp, topk, temperature, stream);
-        }
+    case DevMetaxGpu: {
+        return macaRandomSample((RandomSampleMacaDescriptor_t)desc, workspace, workspace_size, result, probs, random_val, topp, topk, temperature, stream);
+    }
 #endif
 #ifdef ENABLE_MTHREADS_GPU
-        case DevMthreadsGpu:
-            return musaRandomSample((RandomSampleMusaDescriptor_t) desc, workspace, workspace_size, result, probs, random_val, topp, topk, temperature, stream);
+    case DevMthreadsGpu:
+        return musaRandomSample((RandomSampleMusaDescriptor_t)desc, workspace, workspace_size, result, probs, random_val, topp, topk, temperature, stream);
 #endif
     }
     return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
@@ -120,31 +120,31 @@ __C infiniopStatus_t infiniopRandomSample(infiniopRandomSampleDescriptor_t desc,
 __C infiniopStatus_t infiniopDestroyRandomSampleDescriptor(infiniopRandomSampleDescriptor_t desc) {
     switch (desc->device) {
 #ifdef ENABLE_CPU
-        case DevCpu:
-            return cpuDestroyRandomSampleDescriptor((RandomSampleCpuDescriptor_t) desc);
+    case DevCpu:
+        return cpuDestroyRandomSampleDescriptor((RandomSampleCpuDescriptor_t)desc);
 #endif
 #ifdef ENABLE_NV_GPU
-        case DevNvGpu:
-            return cudaDestroyRandomSampleDescriptor((RandomSampleCudaDescriptor_t) desc);
+    case DevNvGpu:
+        return cudaDestroyRandomSampleDescriptor((RandomSampleCudaDescriptor_t)desc);
 #endif
 #ifdef ENABLE_CAMBRICON_MLU
-        case DevCambriconMlu: {
-            return bangDestroyRandomSampleDescriptor((RandomSampleBangDescriptor_t) desc);
-        }
+    case DevCambriconMlu: {
+        return bangDestroyRandomSampleDescriptor((RandomSampleBangDescriptor_t)desc);
+    }
 #endif
 #ifdef ENABLE_ASCEND_NPU
-        case DevAscendNpu: {
-            return ascendDestroyRandomSampleDescriptor((RandomSampleAscendDescriptor_t) desc);
-        }
+    case DevAscendNpu: {
+        return ascendDestroyRandomSampleDescriptor((RandomSampleAscendDescriptor_t)desc);
+    }
 #endif
 #ifdef ENABLE_METAX_GPU
-        case DevMetaxGpu: {
-            return macaDestroyRandomSampleDescriptor((RandomSampleMacaDescriptor_t) desc);
-        }
+    case DevMetaxGpu: {
+        return macaDestroyRandomSampleDescriptor((RandomSampleMacaDescriptor_t)desc);
+    }
 #endif
 #ifdef ENABLE_MTHREADS_GPU
-        case DevMthreadsGpu:
-            return musaDestroyRandomSampleDescriptor((RandomSampleMusaDescriptor_t) desc);
+    case DevMthreadsGpu:
+        return musaDestroyRandomSampleDescriptor((RandomSampleMusaDescriptor_t)desc);
 #endif
     }
     return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
diff --git a/src/infiniop/ops/rearrange/operator.cc b/src/infiniop/ops/rearrange/operator.cc
index 7274e5a14..8bee9a8a7 100644
--- a/src/infiniop/ops/rearrange/operator.cc
+++ b/src/infiniop/ops/rearrange/operator.cc
@@ -7,37 +7,37 @@ __C infiniopStatus_t infiniopCreateRearrangeDescriptor(
     infiniopTensorDescriptor_t src) {
     switch (handle->device) {
 #ifdef ENABLE_CPU
-        case DevCpu:
-            return cpuCreateRearrangeDescriptor(handle, (RearrangeCpuDescriptor_t *) desc_ptr, dst, src);
+    case DevCpu:
+        return cpuCreateRearrangeDescriptor(handle, (RearrangeCpuDescriptor_t *)desc_ptr, dst, src);
 #endif
 #ifdef ENABLE_NV_GPU
-        case DevNvGpu: {
-            return cudaCreateRearrangeDescriptor((CudaHandle_t) handle, (RearrangeCudaDescriptor_t *) desc_ptr, dst, src);
-        }
+    case DevNvGpu: {
+        return cudaCreateRearrangeDescriptor((CudaHandle_t)handle, (RearrangeCudaDescriptor_t *)desc_ptr, dst, src);
+    }
 
 #endif
 #ifdef ENABLE_CAMBRICON_MLU
-        case DevCambriconMlu: {
-            return bangCreateRearrangeDescriptor((BangHandle_t) handle, (RearrangeBangDescriptor_t *) desc_ptr, dst, src);
-        }
+    case DevCambriconMlu: {
+        return bangCreateRearrangeDescriptor((BangHandle_t)handle, (RearrangeBangDescriptor_t *)desc_ptr, dst, src);
+    }
 #endif
 #ifdef ENABLE_ASCEND_NPU
-        case DevAscendNpu: {
-            return aclnnCreateRearrangeDescriptor((AscendHandle_t) handle,
-                                                  (RearrangeAclnnDescriptor_t *) desc_ptr,
-                                                  dst,
-                                                  src);
-        }
+    case DevAscendNpu: {
+        return aclnnCreateRearrangeDescriptor((AscendHandle_t)handle,
+                                              (RearrangeAclnnDescriptor_t *)desc_ptr,
+                                              dst,
+                                              src);
+    }
 #endif
 #ifdef ENABLE_METAX_GPU
-        case DevMetaxGpu: {
-            return macaCreateRearrangeDescriptor((MacaHandle_t) handle, (RearrangeMacaDescriptor_t *) desc_ptr, dst, src);
-        }
+    case DevMetaxGpu: {
+        return macaCreateRearrangeDescriptor((MacaHandle_t)handle, (RearrangeMacaDescriptor_t *)desc_ptr, dst, src);
+    }
 #endif
 #ifdef ENABLE_MTHREADS_GPU
-        case DevMthreadsGpu: {
-            return musaCreateRearrangeDescriptor((MusaHandle_t)handle, (RearrangeMusaDescriptor_t *) desc_ptr, dst, src);
-        }
+    case DevMthreadsGpu: {
+        return musaCreateRearrangeDescriptor((MusaHandle_t)handle, (RearrangeMusaDescriptor_t *)desc_ptr, dst, src);
+    }
 #endif
     }
     return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
@@ -46,37 +46,37 @@ __C infiniopStatus_t infiniopCreateRearrangeDescriptor(
 __C infiniopStatus_t infiniopRearrange(infiniopRearrangeDescriptor_t desc, void *dst, void const *src, void *stream) {
     switch (desc->device) {
 #ifdef ENABLE_CPU
-        case DevCpu:
-            return cpuRearrange((RearrangeCpuDescriptor_t) desc, dst, src, stream);
+    case DevCpu:
+        return cpuRearrange((RearrangeCpuDescriptor_t)desc, dst, src, stream);
 #endif
 #ifdef ENABLE_NV_GPU
-        case DevNvGpu: {
-            return cudaRearrange((RearrangeCudaDescriptor_t) desc, dst, src, stream);
-        }
+    case DevNvGpu: {
+        return cudaRearrange((RearrangeCudaDescriptor_t)desc, dst, src, stream);
+    }
 
 #endif
 #ifdef ENABLE_CAMBRICON_MLU
-        case DevCambriconMlu: {
-            return bangRearrange((RearrangeBangDescriptor_t) desc, dst, src, stream);
-        }
+    case DevCambriconMlu: {
+        return bangRearrange((RearrangeBangDescriptor_t)desc, dst, src, stream);
+    }
 #endif
 #ifdef ENABLE_ASCEND_NPU
-        case DevAscendNpu: {
-            return aclnnRearrange((RearrangeAclnnDescriptor_t) desc,
-                                  dst,
-                                  src,
-                                  stream);
-        }
+    case DevAscendNpu: {
+        return aclnnRearrange((RearrangeAclnnDescriptor_t)desc,
+                              dst,
+                              src,
+                              stream);
+    }
 #endif
 #ifdef ENABLE_METAX_GPU
-        case DevMetaxGpu: {
-            return macaRearrange((RearrangeMacaDescriptor_t) desc, dst, src, stream);
-        }
+    case DevMetaxGpu: {
+        return macaRearrange((RearrangeMacaDescriptor_t)desc, dst, src, stream);
+    }
 #endif
 #ifdef ENABLE_MTHREADS_GPU
-        case DevMthreadsGpu: {
-            return musaRearrange((RearrangeMusaDescriptor_t) desc, dst, src, stream);
-        }
+    case DevMthreadsGpu: {
+        return musaRearrange((RearrangeMusaDescriptor_t)desc, dst, src, stream);
+    }
 #endif
     }
     return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
@@ -85,34 +85,34 @@ __C infiniopStatus_t infiniopRearrange(infiniopRearrangeDescriptor_t desc, void
 __C infiniopStatus_t infiniopDestroyRearrangeDescriptor(infiniopRearrangeDescriptor_t desc) {
     switch (desc->device) {
 #ifdef ENABLE_CPU
-        case DevCpu:
-            return cpuDestroyRearrangeDescriptor((RearrangeCpuDescriptor_t) desc);
+    case DevCpu:
+        return cpuDestroyRearrangeDescriptor((RearrangeCpuDescriptor_t)desc);
 #endif
 #ifdef ENABLE_NV_GPU
-        case DevNvGpu: {
-            return cudaDestroyRearrangeDescriptor((RearrangeCudaDescriptor_t) desc);
-        }
+    case DevNvGpu: {
+        return cudaDestroyRearrangeDescriptor((RearrangeCudaDescriptor_t)desc);
+    }
 
 #endif
 #ifdef ENABLE_CAMBRICON_MLU
-        case DevCambriconMlu: {
-            return bangDestroyRearrangeDescriptor((RearrangeBangDescriptor_t) desc);
-        }
+    case DevCambriconMlu: {
+        return bangDestroyRearrangeDescriptor((RearrangeBangDescriptor_t)desc);
+    }
 #endif
 #ifdef ENABLE_ASCEND_NPU
-        case DevAscendNpu: {
-            return aclnnDestroyRearrangeDescriptor((RearrangeAclnnDescriptor_t) desc);
-        }
+    case DevAscendNpu: {
+        return aclnnDestroyRearrangeDescriptor((RearrangeAclnnDescriptor_t)desc);
+    }
 #endif
 #ifdef ENABLE_METAX_GPU
-        case DevMetaxGpu: {
-            return macaDestroyRearrangeDescriptor((RearrangeMacaDescriptor_t) desc);
-        }
+    case DevMetaxGpu: {
+        return macaDestroyRearrangeDescriptor((RearrangeMacaDescriptor_t)desc);
+    }
 #endif
 #ifdef ENABLE_MTHREADS_GPU
-        case DevMthreadsGpu: {
-            return musaDestroyRearrangeDescriptor((RearrangeMusaDescriptor_t) desc);
-        }
+    case DevMthreadsGpu: {
+        return musaDestroyRearrangeDescriptor((RearrangeMusaDescriptor_t)desc);
+    }
 #endif
     }
     return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
diff --git a/src/infiniop/ops/rms_norm/operator.cc b/src/infiniop/ops/rms_norm/operator.cc
index 8d6eac076..43438ff2a 100644
--- a/src/infiniop/ops/rms_norm/operator.cc
+++ b/src/infiniop/ops/rms_norm/operator.cc
@@ -9,38 +9,38 @@ __C infiniopStatus_t infiniopCreateRMSNormDescriptor(
     float epsilon) {
     switch (handle->device) {
 #ifdef ENABLE_CPU
-        case DevCpu:
-            return cpuCreateRMSNormDescriptor(handle, (RMSNormCpuDescriptor_t *) desc_ptr, y_desc, x_desc, w_desc, epsilon);
+    case DevCpu:
+        return cpuCreateRMSNormDescriptor(handle, (RMSNormCpuDescriptor_t *)desc_ptr, y_desc, x_desc, w_desc, epsilon);
 #endif
 #ifdef ENABLE_NV_GPU
-        case DevNvGpu: {
-            return cudaCreateRMSNormDescriptor((CudaHandle_t) handle, (RMSNormCudaDescriptor_t *) desc_ptr, y_desc, x_desc, w_desc, epsilon);
-        }
+    case DevNvGpu: {
+        return cudaCreateRMSNormDescriptor((CudaHandle_t)handle, (RMSNormCudaDescriptor_t *)desc_ptr, y_desc, x_desc, w_desc, epsilon);
+    }
 #endif
 #ifdef ENABLE_CAMBRICON_MLU
-        case DevCambriconMlu: {
-            return bangCreateRMSNormDescriptor((BangHandle_t) handle, (RMSNormBangDescriptor_t *) desc_ptr, y_desc, x_desc, w_desc, epsilon);
-        }
+    case DevCambriconMlu: {
+        return bangCreateRMSNormDescriptor((BangHandle_t)handle, (RMSNormBangDescriptor_t *)desc_ptr, y_desc, x_desc, w_desc, epsilon);
+    }
 #endif
 #ifdef ENABLE_ASCEND_NPU
-        case DevAscendNpu: {
-            return aclnnCreateRMSNormDescriptor((AscendHandle_t) handle,
-                                                (RMSNormAclnnDescriptor_t *) desc_ptr,
-                                                y_desc,
-                                                x_desc,
-                                                w_desc,
-                                                epsilon);
-        }
+    case DevAscendNpu: {
+        return aclnnCreateRMSNormDescriptor((AscendHandle_t)handle,
+                                            (RMSNormAclnnDescriptor_t *)desc_ptr,
+                                            y_desc,
+                                            x_desc,
+                                            w_desc,
+                                            epsilon);
+    }
 #endif
 #ifdef ENABLE_METAX_GPU
-        case DevMetaxGpu: {
-            return macaCreateRMSNormDescriptor((MacaHandle_t) handle, (RMSNormMacaDescriptor_t *) desc_ptr, y_desc, x_desc, w_desc, epsilon);
-        }
+    case DevMetaxGpu: {
+        return macaCreateRMSNormDescriptor((MacaHandle_t)handle, (RMSNormMacaDescriptor_t *)desc_ptr, y_desc, x_desc, w_desc, epsilon);
+    }
 #endif
 #ifdef ENABLE_MTHREADS_GPU
-        case DevMthreadsGpu: {
-            return musaCreateRMSNormDescriptor((MusaHandle_t) handle, (RMSNormMusaDescriptor_t *) desc_ptr, y_desc, x_desc, w_desc, epsilon);
-        }
+    case DevMthreadsGpu: {
+        return musaCreateRMSNormDescriptor((MusaHandle_t)handle, (RMSNormMusaDescriptor_t *)desc_ptr, y_desc, x_desc, w_desc, epsilon);
+    }
 #endif
     }
     return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
@@ -49,35 +49,35 @@ __C infiniopStatus_t infiniopCreateRMSNormDescriptor(
 __C infiniopStatus_t infiniopGetRMSNormWorkspaceSize(infiniopRMSNormDescriptor_t desc, uint64_t *size) {
     switch (desc->device) {
 #ifdef ENABLE_CPU
-        case DevCpu:
-            return cpuGetRMSNormWorkspaceSize((RMSNormCpuDescriptor_t) desc, size);
+    case DevCpu:
+        return cpuGetRMSNormWorkspaceSize((RMSNormCpuDescriptor_t)desc, size);
 #endif
 #ifdef ENABLE_NV_GPU
-        case DevNvGpu: {
-            return cudaGetRMSNormWorkspaceSize((RMSNormCudaDescriptor_t) desc, size);
-        }
+    case DevNvGpu: {
+        return cudaGetRMSNormWorkspaceSize((RMSNormCudaDescriptor_t)desc, size);
+    }
 
 #endif
 #ifdef ENABLE_CAMBRICON_MLU
-        case DevCambriconMlu: {
-            return bangGetRMSNormWorkspaceSize((RMSNormBangDescriptor_t) desc, size);
-        }
+    case DevCambriconMlu: {
+        return bangGetRMSNormWorkspaceSize((RMSNormBangDescriptor_t)desc, size);
+    }
 #endif
 #ifdef ENABLE_ASCEND_NPU
-        case DevAscendNpu: {
-            return aclnnGetRMSNormWorkspaceSize((RMSNormAclnnDescriptor_t) desc,
-                                                size);
-        }
+    case DevAscendNpu: {
+        return aclnnGetRMSNormWorkspaceSize((RMSNormAclnnDescriptor_t)desc,
+                                            size);
+    }
 #endif
 #ifdef ENABLE_METAX_GPU
-        case DevMetaxGpu: {
-            return macaGetRMSNormWorkspaceSize((RMSNormMacaDescriptor_t) desc, size);
-        }
+    case DevMetaxGpu: {
+        return macaGetRMSNormWorkspaceSize((RMSNormMacaDescriptor_t)desc, size);
+    }
 #endif
 #ifdef ENABLE_MTHREADS_GPU
-        case DevMthreadsGpu: {
-            return musaGetRMSNormWorkspaceSize((RMSNormMusaDescriptor_t) desc, size);
-        }
+    case DevMthreadsGpu: {
+        return musaGetRMSNormWorkspaceSize((RMSNormMusaDescriptor_t)desc, size);
+    }
 #endif
     }
     return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
@@ -87,40 +87,40 @@ __C infiniopStatus_t infiniopRMSNorm(infiniopRMSNormDescriptor_t desc, void *wor
                                      void *y, void const *x, void const *w, void *stream) {
     switch (desc->device) {
 #ifdef ENABLE_CPU
-        case DevCpu:
-            return cpuRMSNorm((RMSNormCpuDescriptor_t) desc, workspace, workspace_size, y, x, w, stream);
+    case DevCpu:
+        return cpuRMSNorm((RMSNormCpuDescriptor_t)desc, workspace, workspace_size, y, x, w, stream);
 #endif
 #ifdef ENABLE_NV_GPU
-        case DevNvGpu: {
-            return cudaRMSNorm((RMSNormCudaDescriptor_t) desc, workspace, workspace_size, y, x, w, stream);
-        }
+    case DevNvGpu: {
+        return cudaRMSNorm((RMSNormCudaDescriptor_t)desc, workspace, workspace_size, y, x, w, stream);
+    }
 
 #endif
 #ifdef ENABLE_CAMBRICON_MLU
-        case DevCambriconMlu: {
-            return bangRMSNorm((RMSNormBangDescriptor_t) desc, workspace, workspace_size, y, x, w, stream);
-        }
+    case DevCambriconMlu: {
+        return bangRMSNorm((RMSNormBangDescriptor_t)desc, workspace, workspace_size, y, x, w, stream);
+    }
 #endif
 #ifdef ENABLE_ASCEND_NPU
-        case DevAscendNpu: {
-            return aclnnRMSNorm((RMSNormAclnnDescriptor_t) desc,
-                                workspace,
-                                workspace_size,
-                                y,
-                                x,
-                                w,
-                                stream);
-        }
+    case DevAscendNpu: {
+        return aclnnRMSNorm((RMSNormAclnnDescriptor_t)desc,
+                            workspace,
+                            workspace_size,
+                            y,
+                            x,
+                            w,
+                            stream);
+    }
 #endif
 #ifdef ENABLE_METAX_GPU
-        case DevMetaxGpu: {
-            return macaRMSNorm((RMSNormMacaDescriptor_t) desc, workspace, workspace_size, y, x, w, stream);
-        }
+    case DevMetaxGpu: {
+        return macaRMSNorm((RMSNormMacaDescriptor_t)desc, workspace, workspace_size, y, x, w, stream);
+    }
 #endif
 #ifdef ENABLE_MTHREADS_GPU
-        case DevMthreadsGpu: {
-            return musaRMSNorm((RMSNormMusaDescriptor_t) desc, workspace, workspace_size, y, x, w, stream);
-        }
+    case DevMthreadsGpu: {
+        return musaRMSNorm((RMSNormMusaDescriptor_t)desc, workspace, workspace_size, y, x, w, stream);
+    }
 #endif
     }
     return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
@@ -129,34 +129,34 @@ __C infiniopStatus_t infiniopRMSNorm(infiniopRMSNormDescriptor_t desc, void *wor
 __C infiniopStatus_t infiniopDestroyRMSNormDescriptor(infiniopRMSNormDescriptor_t desc) {
     switch (desc->device) {
 #ifdef ENABLE_CPU
-        case DevCpu:
-            return cpuDestroyRMSNormDescriptor((RMSNormCpuDescriptor_t) desc);
+    case DevCpu:
+        return cpuDestroyRMSNormDescriptor((RMSNormCpuDescriptor_t)desc);
 #endif
 #ifdef ENABLE_NV_GPU
-        case DevNvGpu: {
-            return cudaDestroyRMSNormDescriptor((RMSNormCudaDescriptor_t) desc);
-        }
+    case DevNvGpu: {
+        return cudaDestroyRMSNormDescriptor((RMSNormCudaDescriptor_t)desc);
+    }
 
 #endif
 #ifdef ENABLE_CAMBRICON_MLU
-        case DevCambriconMlu: {
-            return bangDestroyRMSNormDescriptor((RMSNormBangDescriptor_t) desc);
-        }
+    case DevCambriconMlu: {
+        return bangDestroyRMSNormDescriptor((RMSNormBangDescriptor_t)desc);
+    }
 #endif
 #ifdef ENABLE_ASCEND_NPU
-        case DevAscendNpu: {
-            return aclnnDestroyRMSNormDescriptor((RMSNormAclnnDescriptor_t) desc);
-        }
+    case DevAscendNpu: {
+        return aclnnDestroyRMSNormDescriptor((RMSNormAclnnDescriptor_t)desc);
+    }
 #endif
 #ifdef ENABLE_METAX_GPU
-        case DevMetaxGpu: {
-            return macaDestroyRMSNormDescriptor((RMSNormMacaDescriptor_t) desc);
-        }
+    case DevMetaxGpu: {
+        return macaDestroyRMSNormDescriptor((RMSNormMacaDescriptor_t)desc);
+    }
 #endif
 #ifdef ENABLE_MTHREADS_GPU
-        case DevMthreadsGpu: {
-            return musaDestroyRMSNormDescriptor((RMSNormMusaDescriptor_t) desc);
-        }
+    case DevMthreadsGpu: {
+        return musaDestroyRMSNormDescriptor((RMSNormMusaDescriptor_t)desc);
+    }
 #endif
     }
     return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
diff --git a/src/infiniop/ops/utils.h b/src/infiniop/ops/utils.h
index cbcc37f6a..1ad1eebe0 100644
--- a/src/infiniop/ops/utils.h
+++ b/src/infiniop/ops/utils.h
@@ -13,28 +13,28 @@
 
 #define ROUND_UP_DIV(x, y) ((x + y - 1) / y)
 
-#define CHECK_ERROR(call, target, errCode)                                     \
-    do {                                                                       \
-        if (auto value = (call); value == (target)) {                          \
-            std::cerr << "Error: expected " << (target) << " but got "         \
-                      << value << " in file " << __FILE__ << ", function "     \
-                      << __func__ << ", line " << __LINE__ << std::endl;       \
-            return (errCode);                                                  \
-        }                                                                      \
+#define CHECK_ERROR(call, target, errCode)                                 \
+    do {                                                                   \
+        if (auto value = (call); value == (target)) {                      \
+            std::cerr << "Error: expected " << (target) << " but got "     \
+                      << value << " in file " << __FILE__ << ", function " \
+                      << __func__ << ", line " << __LINE__ << std::endl;   \
+            return (errCode);                                              \
+        }                                                                  \
     } while (0)
 
-#define CREATE_CHECK_ERROR(expr, value, target, errCode)                       \
-    expr;                                                                      \
+#define CREATE_CHECK_ERROR(expr, value, target, errCode) \
+    expr;                                                \
     CHECK_ERROR(value, target, errCode)
 
-#define CHECK_STATUS(call, target)                                             \
-    do {                                                                       \
-        if (auto value = (call); value != (target)) {                          \
-            std::cerr << "Error: expected " << (target) << " but got "         \
-                      << value << " in file " << __FILE__ << ", function "     \
-                      << __func__ << ", line " << __LINE__ << std::endl;       \
-            return value;                                                      \
-        }                                                                      \
+#define CHECK_STATUS(call, target)                                         \
+    do {                                                                   \
+        if (auto value = (call); value != (target)) {                      \
+            std::cerr << "Error: expected " << (target) << " but got "     \
+                      << value << " in file " << __FILE__ << ", function " \
+                      << __func__ << ", line " << __LINE__ << std::endl;   \
+            return value;                                                  \
+        }                                                                  \
     } while (0)
 
 inline std::vector<int64_t> getByteStrides(infiniopTensorDescriptor_t desc) {
@@ -67,8 +67,7 @@ inline bool getBroadcastShape(const uint64_t *shape1, uint64_t ndim1,
 
     // compute broadcasted shape
     for (size_t i = 0; i < max_rank; ++i) {
-        if (padded_shape1[i] == padded_shape2[i] || padded_shape1[i] == 1 ||
-            padded_shape2[i] == 1) {
+        if (padded_shape1[i] == padded_shape2[i] || padded_shape1[i] == 1 || padded_shape2[i] == 1) {
             broadcast_shape[i] = std::max(padded_shape1[i], padded_shape2[i]);
         } else {
             return false;
@@ -89,10 +88,7 @@ inline bool isValidBroadcastShape(infiniopTensorDescriptor_t a,
     auto broadcast_shape = broadcast_shape_.data(),
          padded_shape1 = padded_shape1_.data(),
          padded_shape2 = padded_shape2_.data();
-    if (broadcast_ndim != c->ndim ||
-        !getBroadcastShape(a->shape, a->ndim, b->shape, b->ndim,
-                           broadcast_shape, padded_shape1, padded_shape2,
-                           broadcast_ndim)) {
+    if (broadcast_ndim != c->ndim || !getBroadcastShape(a->shape, a->ndim, b->shape, b->ndim, broadcast_shape, padded_shape1, padded_shape2, broadcast_ndim)) {
         return false;
     }
     return std::equal(broadcast_shape, broadcast_shape + broadcast_ndim,
@@ -126,7 +122,6 @@ inline bool isValidBroadcastShape(infiniopTensorDescriptor_t a,
     return isValidBroadcastShape(a, b, c, std::max(a->ndim, b->ndim));
 }
 
-
 // permute the dimensions of a tensor descriptor
 inline infiniopTensorDescriptor_t permute(infiniopTensorDescriptor_t desc,
                                           const std::vector<size_t> &order) {
@@ -149,10 +144,9 @@ inline infiniopTensorDescriptor_t permute(infiniopTensorDescriptor_t desc,
 // check if the dimensions [dim_start, dim_end] of a tensor descriptor are
 // contiguous
 inline bool isContiguous(const infiniopTensorDescriptor_t &desc,
-                          size_t dim_start, size_t dim_end) {
+                         size_t dim_start, size_t dim_end) {
     for (size_t i = dim_start + 1; i <= dim_end; i++) {
-        if (desc->strides[i - 1] !=
-            static_cast<int64_t>(desc->shape[i]) * desc->strides[i]) {
+        if (desc->strides[i - 1] != static_cast<int64_t>(desc->shape[i]) * desc->strides[i]) {
             return false;
         }
     }
@@ -168,7 +162,7 @@ inline bool isContiguous(const infiniopTensorDescriptor_t &desc) {
 
 // merge the dimensions [dim_start, dim_end] of a tensor descriptor
 inline infiniopTensorDescriptor_t dimMerge(infiniopTensorDescriptor_t desc,
-                                            size_t dim_start, size_t dim_end) {
+                                           size_t dim_start, size_t dim_end) {
     size_t ndim = desc->ndim;
     if (dim_start > dim_end || dim_end >= ndim) {
         return nullptr;
@@ -203,11 +197,10 @@ inline infiniopTensorDescriptor_t dimMerge(infiniopTensorDescriptor_t desc,
 
 // split the dimension dim of a tensor descriptor into multiple dimensions
 inline infiniopTensorDescriptor_t dimSplit(infiniopTensorDescriptor_t desc,
-                                            size_t dim,
-                                            const std::vector<size_t> &dims) {
+                                           size_t dim,
+                                           const std::vector<size_t> &dims) {
     size_t ndim = desc->ndim;
-    if (desc->shape[dim] != std::accumulate(dims.begin(), dims.end(), (size_t)1,
-                                            std::multiplies{})) {
+    if (desc->shape[dim] != std::accumulate(dims.begin(), dims.end(), (size_t)1, std::multiplies{})) {
         return nullptr;
     }
     size_t new_ndim = ndim + dims.size() - 1;
@@ -221,10 +214,7 @@ inline infiniopTensorDescriptor_t dimSplit(infiniopTensorDescriptor_t desc,
     }
     for (size_t i = 0; i < dims.size(); i++) {
         new_shape[index] = dims[i];
-        new_strides[index] =
-            desc->strides[dim] * desc->shape[dim] /
-            std::accumulate(dims.begin(), dims.begin() + i + 1, (size_t)1,
-                            std::multiplies<size_t>());
+        new_strides[index] = desc->strides[dim] * desc->shape[dim] / std::accumulate(dims.begin(), dims.begin() + i + 1, (size_t)1, std::multiplies<size_t>());
         index++;
     }
     for (size_t i = dim + 1; i < ndim; i++) {

From 7bd656b7d0c0f5df69c07306b6d3bb8ffcb4a160 Mon Sep 17 00:00:00 2001
From: YdrMaster <ydrml@hotmail.com>
Date: Mon, 17 Feb 2025 18:52:14 +0800
Subject: [PATCH 4/5] =?UTF-8?q?issue/52:=20=E6=A0=BC=E5=BC=8F=E5=8C=96?=
 =?UTF-8?q?=E6=89=80=E6=9C=89=20python=20=E6=96=87=E4=BB=B6=EF=BC=8C?=
 =?UTF-8?q?=E5=B9=B6=E6=A0=87=E6=B3=A8=E6=8E=92=E9=99=A4=E6=A0=BC=E5=BC=8F?=
 =?UTF-8?q?=E5=8C=96=E7=9A=84=E5=8C=BA=E5=9F=9F?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: YdrMaster <ydrml@hotmail.com>
---
 test/infiniop/__init__.py              |   2 +-
 test/infiniop/add.py                   |  24 +++++-
 test/infiniop/avg_pool.py              |  28 +++++--
 test/infiniop/causal_softmax.py        |   4 +-
 test/infiniop/conv.py                  |  32 ++++----
 test/infiniop/expand.py                |  14 +++-
 test/infiniop/gemm.py                  |  23 ++++--
 test/infiniop/global_avg_pool.py       |   7 +-
 test/infiniop/libinfiniop/__init__.py  |  10 ++-
 test/infiniop/libinfiniop/datatypes.py |   2 +-
 test/infiniop/libinfiniop/utils.py     | 101 ++++++++++++++++++-------
 test/infiniop/matmul.py                |  68 +++++++++++------
 test/infiniop/max_pool.py              |  22 ++++--
 test/infiniop/random_sample.py         |  71 ++++++++++-------
 test/infiniop/rearrange.py             |  12 +--
 test/infiniop/relu.py                  |  15 +++-
 test/infiniop/rms_norm.py              |  48 ++++++++----
 test/infiniop/rotary_embedding.py      |  14 ++--
 test/infiniop/swiglu.py                |  25 ++++--
 19 files changed, 370 insertions(+), 152 deletions(-)

diff --git a/test/infiniop/__init__.py b/test/infiniop/__init__.py
index 1cde77ac7..81bb65771 100644
--- a/test/infiniop/__init__.py
+++ b/test/infiniop/__init__.py
@@ -1 +1 @@
-import libinfiniop
\ No newline at end of file
+import libinfiniop
diff --git a/test/infiniop/add.py b/test/infiniop/add.py
index 455014cca..0a17fcecb 100644
--- a/test/infiniop/add.py
+++ b/test/infiniop/add.py
@@ -41,8 +41,8 @@ def test(
     lib,
     handle,
     torch_device,
-    c_shape, 
-    a_shape, 
+    c_shape,
+    a_shape,
     b_shape,
     tensor_dtype=torch.float16,
     inplace=Inplace.OUT_OF_PLACE,
@@ -56,13 +56,21 @@ def test(
 
     a = torch.rand(a_shape, dtype=tensor_dtype).to(torch_device)
     b = torch.rand(b_shape, dtype=tensor_dtype).to(torch_device)
-    c = torch.rand(c_shape, dtype=tensor_dtype).to(torch_device) if inplace == Inplace.OUT_OF_PLACE else (a if inplace == Inplace.INPLACE_A else b)
+    c = (
+        torch.rand(c_shape, dtype=tensor_dtype).to(torch_device)
+        if inplace == Inplace.OUT_OF_PLACE
+        else (a if inplace == Inplace.INPLACE_A else b)
+    )
 
     ans = add(a, b)
 
     a_tensor = to_tensor(a, lib)
     b_tensor = to_tensor(b, lib)
-    c_tensor = to_tensor(c, lib) if inplace == Inplace.OUT_OF_PLACE else (a_tensor if inplace == Inplace.INPLACE_A else b_tensor)
+    c_tensor = (
+        to_tensor(c, lib)
+        if inplace == Inplace.OUT_OF_PLACE
+        else (a_tensor if inplace == Inplace.INPLACE_A else b_tensor)
+    )
     descriptor = infiniopAddDescriptor_t()
 
     check_error(
@@ -91,8 +99,10 @@ def test_cpu(lib, test_cases):
     device = DeviceEnum.DEVICE_CPU
     handle = create_handle(lib, device)
     for c_shape, a_shape, b_shape, inplace in test_cases:
+        # fmt: off
         test(lib, handle, "cpu", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace)
         test(lib, handle, "cpu", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace)
+        # fmt: on
     destroy_handle(lib, handle)
 
 
@@ -100,8 +110,10 @@ def test_cuda(lib, test_cases):
     device = DeviceEnum.DEVICE_CUDA
     handle = create_handle(lib, device)
     for c_shape, a_shape, b_shape, inplace in test_cases:
+        # fmt: off
         test(lib, handle, "cuda", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace)
         test(lib, handle, "cuda", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace)
+        # fmt: on
     destroy_handle(lib, handle)
 
 
@@ -111,13 +123,16 @@ def test_bang(lib, test_cases):
     device = DeviceEnum.DEVICE_BANG
     handle = create_handle(lib, device)
     for c_shape, a_shape, b_shape, inplace in test_cases:
+        # fmt: off
         test(lib, handle, "mlu", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace)
         test(lib, handle, "mlu", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace)
+        # fmt: on
     destroy_handle(lib, handle)
 
 
 if __name__ == "__main__":
     test_cases = [
+        # fmt: off
         # c_shape, a_shape, b_shape, inplace
         # ((32, 150, 512000), (32, 150, 512000), (32, 150, 512000), Inplace.OUT_OF_PLACE),
         # ((32, 150, 51200), (32, 150, 51200), (32, 150, 1), Inplace.OUT_OF_PLACE),
@@ -133,6 +148,7 @@ def test_bang(lib, test_cases):
         ((2, 4, 3), (2, 1, 3), (4, 3), Inplace.OUT_OF_PLACE),
         ((2, 3, 4, 5), (2, 3, 4, 5), (5,), Inplace.OUT_OF_PLACE),
         ((3, 2, 4, 5), (4, 5), (3, 2, 1, 1), Inplace.OUT_OF_PLACE),
+        # fmt: on
     ]
     args = get_args()
     lib = open_lib()
diff --git a/test/infiniop/avg_pool.py b/test/infiniop/avg_pool.py
index 9c2407896..3fa30b12d 100644
--- a/test/infiniop/avg_pool.py
+++ b/test/infiniop/avg_pool.py
@@ -35,7 +35,7 @@ class AvgPoolDescriptor(Structure):
 infiniopAvgPoolDescriptor_t = POINTER(AvgPoolDescriptor)
 
 
-def pool(x, k, padding, stride, dilation = 1):
+def pool(x, k, padding, stride, dilation=1):
     pooling_layers = {
         1: torch.nn.AvgPool1d,
         2: torch.nn.AvgPool2d,
@@ -48,7 +48,9 @@ def pool(x, k, padding, stride, dilation = 1):
         return None
 
     if ndim == 3 and x.dtype == torch.float16:
-        ans = pooling_layers[ndim](k, stride=stride, padding=padding)(x.to(torch.float32)).to(torch.float16)
+        ans = pooling_layers[ndim](k, stride=stride, padding=padding)(
+            x.to(torch.float32)
+        ).to(torch.float16)
     else:
         ans = pooling_layers[ndim](k, stride=stride, padding=padding)(x)
     if PROFILE:
@@ -69,18 +71,20 @@ def inferShape(x_shape, kernel_shape, padding, strides):
 
     return x_shape[:2] + tuple(output_shape)
 
+
 # convert a python tuple to a ctype void pointer
 def tuple_to_void_p(py_tuple: Tuple):
     array = ctypes.c_int64 * len(py_tuple)
     data_array = array(*py_tuple)
     return ctypes.cast(data_array, ctypes.c_void_p)
 
+
 def test(
     lib,
     handle,
     torch_device,
-    x_shape, 
-    k_shape, 
+    x_shape,
+    k_shape,
     padding,
     strides,
     tensor_dtype=torch.float16,
@@ -90,7 +94,9 @@ def test(
     )
 
     x = torch.rand(x_shape, dtype=tensor_dtype).to(torch_device)
-    y = torch.rand(inferShape(x_shape, k_shape, padding, strides), dtype=tensor_dtype).to(torch_device)
+    y = torch.rand(
+        inferShape(x_shape, k_shape, padding, strides), dtype=tensor_dtype
+    ).to(torch_device)
 
     for i in range(NUM_PRERUN if PROFILE else 1):
         ans = pool(x, k_shape, padding, strides)
@@ -126,7 +132,9 @@ def test(
     check_error(
         lib.infiniopGetAvgPoolWorkspaceSize(descriptor, ctypes.byref(workspaceSize))
     )
-    workspace = torch.zeros(int(workspaceSize.value), dtype=torch.uint8).to(torch_device)
+    workspace = torch.zeros(int(workspaceSize.value), dtype=torch.uint8).to(
+        torch_device
+    )
     workspace_ptr = ctypes.cast(workspace.data_ptr(), ctypes.POINTER(ctypes.c_uint8))
 
     for i in range(NUM_PRERUN if PROFILE else 1):
@@ -164,8 +172,10 @@ def test_cpu(lib, test_cases):
     device = DeviceEnum.DEVICE_CPU
     handle = create_handle(lib, device)
     for x_shape, kernel_shape, padding, strides in test_cases:
+        # fmt: off
         test(lib, handle, "cpu", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float16)
         test(lib, handle, "cpu", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float32)
+        # fmt: on
     destroy_handle(lib, handle)
 
 
@@ -173,8 +183,10 @@ def test_cuda(lib, test_cases):
     device = DeviceEnum.DEVICE_CUDA
     handle = create_handle(lib, device)
     for x_shape, kernel_shape, padding, strides in test_cases:
+        # fmt: off
         test(lib, handle, "cuda", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float16)
         test(lib, handle, "cuda", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float32)
+        # fmt: on
     destroy_handle(lib, handle)
 
 
@@ -184,17 +196,21 @@ def test_bang(lib, test_cases):
     device = DeviceEnum.DEVICE_BANG
     handle = create_handle(lib, device)
     for x_shape, kernel_shape, padding, strides in test_cases:
+        # fmt: off
         test(lib, handle, "mlu", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float16)
         test(lib, handle, "mlu", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float32)
+        # fmt: on
     destroy_handle(lib, handle)
 
 
 if __name__ == "__main__":
     test_cases = [
+        # fmt: off
         # x_shape, kernel_shape, padding, strides
         ((1, 1, 10), (3,), (1,), (1,)),
         ((32, 3, 224, 224), (3, 3), (1, 1), (2, 2)),
         ((1, 1, 16, 16, 16), (5, 5, 5), (2, 2, 2), (2, 2, 2)),
+        # fmt: on
     ]
     args = get_args()
     lib = open_lib()
diff --git a/test/infiniop/causal_softmax.py b/test/infiniop/causal_softmax.py
index 1ad304b27..a5c66bfbb 100644
--- a/test/infiniop/causal_softmax.py
+++ b/test/infiniop/causal_softmax.py
@@ -101,6 +101,7 @@ def test_bang(lib, test_cases):
         test(lib, handle, "mlu", x_shape, x_stride)
     destroy_handle(lib, handle)
 
+
 def test_ascend(lib, test_cases):
     import torch_npu
 
@@ -111,11 +112,12 @@ def test_ascend(lib, test_cases):
 
     destroy_handle(lib, handle)
 
+
 if __name__ == "__main__":
     test_cases = [
         # x_shape, x_stride
         ((32, 20, 512), None),
-        ((32, 20, 512), (20480, 512, 1)), # Ascend 暂不支持非连续
+        ((32, 20, 512), (20480, 512, 1)),  # Ascend 暂不支持非连续
     ]
     args = get_args()
     lib = open_lib()
diff --git a/test/infiniop/conv.py b/test/infiniop/conv.py
index 7e7ea953a..157967ade 100644
--- a/test/infiniop/conv.py
+++ b/test/infiniop/conv.py
@@ -41,17 +41,11 @@ class ConvDescriptor(Structure):
 def conv(x, w, stride, padding, dilation):
     match len(x.shape) - 2:
         case 1:
-            return F.conv1d(
-                x, w, stride=stride, padding=padding, dilation=dilation
-            )
+            return F.conv1d(x, w, stride=stride, padding=padding, dilation=dilation)
         case 2:
-            return F.conv2d(
-                x, w, stride=stride, padding=padding, dilation=dilation
-            )
+            return F.conv2d(x, w, stride=stride, padding=padding, dilation=dilation)
         case 3:
-            return F.conv3d(
-                x, w, stride=stride, padding=padding, dilation=dilation
-            )
+            return F.conv3d(x, w, stride=stride, padding=padding, dilation=dilation)
         case _:
             print("Error: Pytorch -> Unsupported tensor dimension")
             return None
@@ -66,11 +60,15 @@ def inferShape(
     dilations: List[int],
 ) -> Tuple[int, ...]:
     assert (
-        len(x_shape) == len(w_shape) == len(pads) + 2 == len(dilations) + 2 == len(strides) + 2
+        len(x_shape)
+        == len(w_shape)
+        == len(pads) + 2
+        == len(dilations) + 2
+        == len(strides) + 2
     ), "x and w should have the same length; pads, strides, and dilatinos should have the same length; the length of pads should be that of x - 2"
     output_dims = [
         math.floor(
-            (x_shape[i+2] + 2 * pads[i] - dilations[i] * (w_shape[i+2] - 1) - 1)
+            (x_shape[i + 2] + 2 * pads[i] - dilations[i] * (w_shape[i + 2] - 1) - 1)
             / strides[i]
             + 1
         )
@@ -145,7 +143,9 @@ def test(
     check_error(
         lib.infiniopGetConvWorkspaceSize(descriptor, ctypes.byref(workspaceSize))
     )
-    workspace = torch.zeros(int(workspaceSize.value), dtype=torch.uint8).to(torch_device)
+    workspace = torch.zeros(int(workspaceSize.value), dtype=torch.uint8).to(
+        torch_device
+    )
     workspace_ptr = ctypes.cast(workspace.data_ptr(), ctypes.POINTER(ctypes.c_uint8))
 
     for i in range(NUM_PRERUN if PROFILE else 1):
@@ -177,7 +177,7 @@ def test(
         elapsed = (time.time() - start_time) / NUM_ITERATIONS
         print(f"    lib time: {elapsed :6f}")
 
-    if (tensor_dtype == torch.float16):
+    if tensor_dtype == torch.float16:
         assert torch.allclose(y, ans, atol=0, rtol=1e-2)
     else:
         assert torch.allclose(y, ans, atol=0, rtol=1e-3)
@@ -188,8 +188,10 @@ def test_cpu(lib, test_cases):
     device = DeviceEnum.DEVICE_CPU
     handle = create_handle(lib, device)
     for x_shape, w_shape, pads, strides, dilations, x_strides in test_cases:
+        # fmt: off
         test(lib, handle, "cpu", x_shape, w_shape, pads, strides, dilations, x_strides, tensor_dtype=torch.float16)
         test(lib, handle, "cpu", x_shape, w_shape, pads, strides, dilations, x_strides, tensor_dtype=torch.float32)
+        # fmt: on
     destroy_handle(lib, handle)
 
 
@@ -197,8 +199,10 @@ def test_cuda(lib, test_cases):
     device = DeviceEnum.DEVICE_CUDA
     handle = create_handle(lib, device)
     for x_shape, w_shape, pads, strides, dilations, x_strides in test_cases:
+        # fmt: off
         test(lib, handle, "cuda", x_shape, w_shape, pads, strides, dilations, x_strides, tensor_dtype=torch.float16)
         test(lib, handle, "cuda", x_shape, w_shape, pads, strides, dilations, x_strides, tensor_dtype=torch.float32)
+        # fmt: on
     destroy_handle(lib, handle)
 
 
@@ -208,8 +212,10 @@ def test_bang(lib, test_cases):
     device = DeviceEnum.DEVICE_BANG
     handle = create_handle(lib, device)
     for x_shape, w_shape, pads, strides, dilations, x_strides in test_cases:
+        # fmt: off
         test(lib, handle, "mlu", x_shape, w_shape, pads, strides, dilations, x_strides, tensor_dtype=torch.float16)
         test(lib, handle, "mlu", x_shape, w_shape, pads, strides, dilations, x_strides, tensor_dtype=torch.float32)
+        # fmt: on
     destroy_handle(lib, handle)
 
 
diff --git a/test/infiniop/expand.py b/test/infiniop/expand.py
index e060ad73f..505766be8 100644
--- a/test/infiniop/expand.py
+++ b/test/infiniop/expand.py
@@ -47,10 +47,10 @@ def test(
     lib,
     handle,
     torch_device,
-    y_shape, 
+    y_shape,
     x_shape,
-    y_stride=None, 
-    x_stride=None, 
+    y_stride=None,
+    x_stride=None,
     tensor_dtype=torch.float16,
 ):
     print(
@@ -109,8 +109,10 @@ def test_cpu(lib, test_cases):
     device = DeviceEnum.DEVICE_CPU
     handle = create_handle(lib, device)
     for y_shape, x_shape, y_stride, x_stride in test_cases:
+        # fmt: off
         test(lib, handle, "cpu", y_shape, x_shape, y_stride, x_stride, tensor_dtype=torch.float16)
         test(lib, handle, "cpu", y_shape, x_shape, y_stride, x_stride, tensor_dtype=torch.float32)
+        # fmt: on
     destroy_handle(lib, handle)
 
 
@@ -118,8 +120,10 @@ def test_cuda(lib, test_cases):
     device = DeviceEnum.DEVICE_CUDA
     handle = create_handle(lib, device)
     for y_shape, x_shape, y_stride, x_stride in test_cases:
+        # fmt: off
         test(lib, handle, "cuda", y_shape, x_shape, y_stride, x_stride, tensor_dtype=torch.float16)
         test(lib, handle, "cuda", y_shape, x_shape, y_stride, x_stride, tensor_dtype=torch.float32)
+        # fmt: on
     destroy_handle(lib, handle)
 
 
@@ -129,13 +133,16 @@ def test_bang(lib, test_cases):
     device = DeviceEnum.DEVICE_BANG
     handle = create_handle(lib, device)
     for y_shape, x_shape, y_stride, x_stride in test_cases:
+        # fmt: off
         test(lib, handle, "mlu", y_shape, x_shape, y_stride, x_stride, tensor_dtype=torch.float16)
         test(lib, handle, "mlu", y_shape, x_shape, y_stride, x_stride, tensor_dtype=torch.float32)
+        # fmt: on
     destroy_handle(lib, handle)
 
 
 if __name__ == "__main__":
     test_cases = [
+        # fmt: off
         # y_shape, x_shape, y_stride, x_stride
         ((), (), None, None),
         ((3, 3), (1,), None, None),
@@ -146,6 +153,7 @@ def test_bang(lib, test_cases):
         ((2, 3, 4, 5), (5,), None, None),
         ((3, 2, 4, 5), (3, 2, 1, 1), None, None),
         ((32, 256, 112, 112), (32, 256, 112, 1), None, None),
+        # fmt: on
     ]
     args = get_args()
     lib = open_lib()
diff --git a/test/infiniop/gemm.py b/test/infiniop/gemm.py
index 5da99eaca..9d738178d 100644
--- a/test/infiniop/gemm.py
+++ b/test/infiniop/gemm.py
@@ -27,6 +27,7 @@
 NUM_PRERUN = 10
 NUM_ITERATIONS = 1000
 
+
 class GEMMDescriptor(Structure):
     _fields_ = [("device", c_int32)]
 
@@ -34,10 +35,15 @@ class GEMMDescriptor(Structure):
 infiniopGEMMDescriptor_t = POINTER(GEMMDescriptor)
 
 
-def gemm(A, B, C=None, transA=False, transB=False, alpha=1.0, beta=0.0, dtype=torch.float32):
+def gemm(
+    A, B, C=None, transA=False, transB=False, alpha=1.0, beta=0.0, dtype=torch.float32
+):
     A = A.T if transA else A
     B = B.T if transB else B
-    result = alpha * torch.matmul(A if dtype != torch.float16 else A.to(torch.float32), B if dtype != torch.float16 else B.to(torch.float32)).to(dtype)
+    result = alpha * torch.matmul(
+        A if dtype != torch.float16 else A.to(torch.float32),
+        B if dtype != torch.float16 else B.to(torch.float32),
+    ).to(dtype)
     if C is not None:
         result += beta * C if dtype != torch.float16 else C.to(torch.float32)
     if PROFILE:
@@ -64,7 +70,7 @@ def test(
     dtype=torch.float16,
 ):
     print(
-        f"Testing GEMM on {torch_device} with transA: {transA} transB: {transB} " 
+        f"Testing GEMM on {torch_device} with transA: {transA} transB: {transB} "
         f"a_shape:{a_shape} b_shape:{b_shape} c_shape:{c_shape} y_shape:{y_shape} "
         f"a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} y_stride:{y_stride} dtype:{dtype}"
     )
@@ -121,9 +127,7 @@ def test(
 
     workspace_size = ctypes.c_uint64(0)
     check_error(
-        lib.infiniopGetGEMMWorkspaceSize(
-            descriptor, ctypes.byref(workspace_size)
-        )
+        lib.infiniopGetGEMMWorkspaceSize(descriptor, ctypes.byref(workspace_size))
     )
     workspace = torch.zeros(int(workspace_size.value), dtype=torch.uint8).to(
         torch_device
@@ -182,8 +186,10 @@ def test_cpu(lib, test_cases):
         c_stride,
         y_stride,
     ) in test_cases:
+        # fmt: off
         test(lib, handle, "cpu", alpha, beta, transA, transB, a_shape, b_shape, c_shape, y_shape, a_stride, b_stride, c_stride, y_stride, dtype=torch.float16)
         test(lib, handle, "cpu", alpha, beta, transA, transB, a_shape, b_shape, c_shape, y_shape, a_stride, b_stride, c_stride, y_stride, dtype=torch.float32)
+        # fmt: on
     destroy_handle(lib, handle)
 
 
@@ -204,8 +210,10 @@ def test_cuda(lib, test_cases):
         c_stride,
         y_stride,
     ) in test_cases:
+        # fmt: off
         test(lib, handle, "cuda", alpha, beta, transA, transB, a_shape, b_shape, c_shape, y_shape, a_stride, b_stride, c_stride, y_stride, dtype=torch.float16)
         test(lib, handle, "cuda", alpha, beta, transA, transB, a_shape, b_shape, c_shape, y_shape, a_stride, b_stride, c_stride, y_stride, dtype=torch.float32)
+        # fmt: on
     destroy_handle(lib, handle)
 
 
@@ -229,9 +237,10 @@ def test_bang(lib, test_cases):
         c_stride,
         y_stride,
     ) in test_cases:
+        # fmt: off
         test(lib, handle, "mlu", alpha, beta, transA, transB, a_shape, b_shape, c_shape, y_shape, a_stride, b_stride, c_stride, y_stride, dtype=torch.float16)
         test(lib, handle, "mlu", alpha, beta, transA, transB, a_shape, b_shape, c_shape, y_shape, a_stride, b_stride, c_stride, y_stride, dtype=torch.float32)
-
+        # fmt: on
     destroy_handle(lib, handle)
 
 
diff --git a/test/infiniop/global_avg_pool.py b/test/infiniop/global_avg_pool.py
index 33f7b64d0..f3a5cfebd 100644
--- a/test/infiniop/global_avg_pool.py
+++ b/test/infiniop/global_avg_pool.py
@@ -99,7 +99,12 @@ def test(
     for i in range(NUM_PRERUN if PROFILE else 1):
         check_error(
             lib.infiniopGlobalAvgPool(
-                descriptor, workspace_ptr, workspaceSize, y_tensor.data, x_tensor.data, None
+                descriptor,
+                workspace_ptr,
+                workspaceSize,
+                y_tensor.data,
+                x_tensor.data,
+                None,
             )
         )
     if PROFILE:
diff --git a/test/infiniop/libinfiniop/__init__.py b/test/infiniop/libinfiniop/__init__.py
index 0831efba9..cf827d300 100644
--- a/test/infiniop/libinfiniop/__init__.py
+++ b/test/infiniop/libinfiniop/__init__.py
@@ -1,7 +1,13 @@
 import os
 import sys
-sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '.')))
-from .liboperators import open_lib, CTensor, infiniopHandle_t, infiniopTensorDescriptor_t
+
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), ".")))
+from .liboperators import (
+    open_lib,
+    CTensor,
+    infiniopHandle_t,
+    infiniopTensorDescriptor_t,
+)
 from .devices import *
 from .utils import *
 from .datatypes import *
diff --git a/test/infiniop/libinfiniop/datatypes.py b/test/infiniop/libinfiniop/datatypes.py
index f07977db4..f928904c9 100644
--- a/test/infiniop/libinfiniop/datatypes.py
+++ b/test/infiniop/libinfiniop/datatypes.py
@@ -7,7 +7,7 @@ class InfiniDtype:
     I32 = 5
     I64 = 6
     U8 = 7
-    U16 = 8 
+    U16 = 8
     U32 = 9
     U64 = 10
     F8 = 11
diff --git a/test/infiniop/libinfiniop/utils.py b/test/infiniop/libinfiniop/utils.py
index 83c69083a..cc103093d 100644
--- a/test/infiniop/libinfiniop/utils.py
+++ b/test/infiniop/libinfiniop/utils.py
@@ -54,6 +54,7 @@ def create_workspace(size, torch_device):
     if size == 0:
         return None
     import torch
+
     return torch.zeros(size=(size,), dtype=torch.uint8, device=torch_device)
 
 
@@ -172,6 +173,7 @@ def get_args():
 
 def synchronize_device(torch_device):
     import torch
+
     if torch_device == "cuda":
         torch.cuda.synchronize()
     elif torch_device == "npu":
@@ -197,13 +199,24 @@ def debug(actual, desired, atol=0, rtol=1e-2, equal_nan=False, verbose=True):
         If True, the function will print detailed information about any discrepancies between the tensors.
     """
     import numpy as np
+
     print_discrepancy(actual, desired, atol, rtol, verbose)
-    np.testing.assert_allclose(actual.cpu(), desired.cpu(), rtol, atol, equal_nan, verbose=True, strict=True)
+    np.testing.assert_allclose(
+        actual.cpu(), desired.cpu(), rtol, atol, equal_nan, verbose=True, strict=True
+    )
 
 
-def debug_all(actual_vals: Sequence, desired_vals: Sequence, condition: str, atol=0, rtol=1e-2, equal_nan=False, verbose=True):
+def debug_all(
+    actual_vals: Sequence,
+    desired_vals: Sequence,
+    condition: str,
+    atol=0,
+    rtol=1e-2,
+    equal_nan=False,
+    verbose=True,
+):
     """
-    Debugging function to compare two sequences of values (actual and desired) pair by pair, results 
+    Debugging function to compare two sequences of values (actual and desired) pair by pair, results
     are linked by the given logical condition, and prints discrepancies
     Arguments:
     ----------
@@ -223,7 +236,10 @@ def debug_all(actual_vals: Sequence, desired_vals: Sequence, condition: str, ato
     - AssertionError: If the specified `condition` is not 'or' or 'and'.
     """
     assert len(actual_vals) == len(desired_vals), "Invalid Length"
-    assert condition in {"or", "and"}, "Invalid condition: should be either 'or' or 'and'"
+    assert condition in {
+        "or",
+        "and",
+    }, "Invalid condition: should be either 'or' or 'and'"
     import numpy as np
 
     passed = False if condition == "or" else True
@@ -237,14 +253,22 @@ def debug_all(actual_vals: Sequence, desired_vals: Sequence, condition: str, ato
         elif condition == "and":
             if passed and len(indices) != 0:
                 passed = False
-                print(f"\033[31mThe condition has not been satisfied: Condition #{index + 1}\033[0m")
-            np.testing.assert_allclose(actual.cpu(), desired.cpu(), rtol, atol, equal_nan, verbose=True, strict=True)
+                print(
+                    f"\033[31mThe condition has not been satisfied: Condition #{index + 1}\033[0m"
+                )
+            np.testing.assert_allclose(
+                actual.cpu(),
+                desired.cpu(),
+                rtol,
+                atol,
+                equal_nan,
+                verbose=True,
+                strict=True,
+            )
     assert passed, "\033[31mThe condition has not been satisfied\033[0m"
 
 
-def print_discrepancy(
-    actual, expected, atol=0, rtol=1e-3, verbose=True
-):
+def print_discrepancy(actual, expected, atol=0, rtol=1e-3, verbose=True):
     if actual.shape != expected.shape:
         raise ValueError("Tensors must have the same shape to compare.")
 
@@ -273,7 +297,9 @@ def add_color(text, color_code):
         for idx in diff_indices:
             index_tuple = tuple(idx.tolist())
             actual_str = f"{actual[index_tuple]:<{col_width[1]}.{decimal_places[1]}f}"
-            expected_str = f"{expected[index_tuple]:<{col_width[2]}.{decimal_places[2]}f}"
+            expected_str = (
+                f"{expected[index_tuple]:<{col_width[2]}.{decimal_places[2]}f}"
+            )
             delta_str = f"{delta[index_tuple]:<{col_width[3]}.{decimal_places[3]}f}"
             print(
                 f" > Index: {str(index_tuple):<{col_width[0]}}"
@@ -287,10 +313,18 @@ def add_color(text, color_code):
         print(f"  - Desired dtype: {expected.dtype}")
         print(f"  - Atol: {atol}")
         print(f"  - Rtol: {rtol}")
-        print(f"  - Mismatched elements: {len(diff_indices)} / {actual.numel()} ({len(diff_indices) / actual.numel() * 100}%)")
-        print(f"  - Min(actual) : {torch.min(actual):<{col_width[1]}} | Max(actual) : {torch.max(actual):<{col_width[2]}}")
-        print(f"  - Min(desired): {torch.min(expected):<{col_width[1]}} | Max(desired): {torch.max(expected):<{col_width[2]}}")
-        print(f"  - Min(delta)  : {torch.min(delta):<{col_width[1]}} | Max(delta)  : {torch.max(delta):<{col_width[2]}}")
+        print(
+            f"  - Mismatched elements: {len(diff_indices)} / {actual.numel()} ({len(diff_indices) / actual.numel() * 100}%)"
+        )
+        print(
+            f"  - Min(actual) : {torch.min(actual):<{col_width[1]}} | Max(actual) : {torch.max(actual):<{col_width[2]}}"
+        )
+        print(
+            f"  - Min(desired): {torch.min(expected):<{col_width[1]}} | Max(desired): {torch.max(expected):<{col_width[2]}}"
+        )
+        print(
+            f"  - Min(delta)  : {torch.min(delta):<{col_width[1]}} | Max(delta)  : {torch.max(delta):<{col_width[2]}}"
+        )
         print("-" * total_width + "\n")
 
     return diff_indices
@@ -298,14 +332,17 @@ def add_color(text, color_code):
 
 def get_tolerance(tolerance_map, tensor_dtype, default_atol=0, default_rtol=1e-3):
     """
-    Returns the atol and rtol for a given tensor data type in the tolerance_map. 
+    Returns the atol and rtol for a given tensor data type in the tolerance_map.
     If the given data type is not found, it returns the provided default tolerance values.
     """
-    return tolerance_map.get(tensor_dtype, {'atol': default_atol, 'rtol': default_rtol}).values()
+    return tolerance_map.get(
+        tensor_dtype, {"atol": default_atol, "rtol": default_rtol}
+    ).values()
 
 
 def timed_op(func, num_iterations, device):
     import time
+
     """ Function for timing operations with synchronization. """
     synchronize_device(device)
     start = time.time()
@@ -318,7 +355,7 @@ def timed_op(func, num_iterations, device):
 def profile_operation(desc, func, torch_device, NUM_PRERUN, NUM_ITERATIONS):
     """
     Unified profiling workflow that is used to profile the execution time of a given function.
-    It first performs a number of warmup runs, then performs timed execution and 
+    It first performs a number of warmup runs, then performs timed execution and
     prints the average execution time.
 
     Arguments:
@@ -328,11 +365,11 @@ def profile_operation(desc, func, torch_device, NUM_PRERUN, NUM_ITERATIONS):
     - torch_device (str): The device on which the operation runs, provided for timed execution.
     - NUM_PRERUN (int): The number of warmup runs.
     - NUM_ITERATIONS (int): The number of timed execution iterations, used to calculate the average execution time.
-    """    
+    """
     # Warmup runs
     for _ in range(NUM_PRERUN):
         func()
-    
+
     # Timed execution
     elapsed = timed_op(lambda: func(), NUM_ITERATIONS, torch_device)
     print(f" {desc} time: {elapsed * 1000 :6f} ms")
@@ -347,7 +384,7 @@ def test_operator(lib, device, test_func, test_cases, tensor_dtypes):
     - lib (ctypes.CDLL): The library object containing the operator implementations.
     - device (InfiniDeviceEnum): The device on which the operator should be tested. See device.py.
     - test_func (function): The test function to be executed for each test case.
-    - test_cases (list of tuples): A list of test cases, where each test case is a tuple of parameters 
+    - test_cases (list of tuples): A list of test cases, where each test case is a tuple of parameters
         to be passed to `test_func`.
     - tensor_dtypes (list): A list of tensor data types (e.g., `torch.float32`) to test.
     """
@@ -355,7 +392,13 @@ def test_operator(lib, device, test_func, test_cases, tensor_dtypes):
     try:
         for test_case in test_cases:
             for tensor_dtype in tensor_dtypes:
-                test_func(lib, handle, infiniDeviceEnum_str_map[device], *test_case, tensor_dtype)
+                test_func(
+                    lib,
+                    handle,
+                    infiniDeviceEnum_str_map[device],
+                    *test_case,
+                    tensor_dtype,
+                )
     finally:
         destroy_handle(lib, handle)
 
@@ -365,22 +408,26 @@ def get_test_devices(args):
     Using the given parsed Namespace to determine the devices to be tested.
 
     Argument:
-    - args: the parsed Namespace object. 
+    - args: the parsed Namespace object.
 
     Return:
     - devices_to_test: the devices that will be tested. Default is CPU.
     """
     devices_to_test = []
 
-    if args.cpu: devices_to_test.append(InfiniDeviceEnum.CPU)
-    if args.nvidia: devices_to_test.append(InfiniDeviceEnum.NVIDIA)
-    if args.cambricon: 
+    if args.cpu:
+        devices_to_test.append(InfiniDeviceEnum.CPU)
+    if args.nvidia:
+        devices_to_test.append(InfiniDeviceEnum.NVIDIA)
+    if args.cambricon:
         import torch_mlu
+
         devices_to_test.append(InfiniDeviceEnum.CAMBRICON)
-    if args.ascend: 
+    if args.ascend:
         import torch
         import torch_npu
-        torch.npu.set_device(0) # Ascend NPU needs explicit device initialization
+
+        torch.npu.set_device(0)  # Ascend NPU needs explicit device initialization
         devices_to_test.append(InfiniDeviceEnum.ASCEND)
     if not devices_to_test:
         devices_to_test = [InfiniDeviceEnum.CPU]
diff --git a/test/infiniop/matmul.py b/test/infiniop/matmul.py
index 06e6bec84..6dc5ac5fd 100644
--- a/test/infiniop/matmul.py
+++ b/test/infiniop/matmul.py
@@ -2,9 +2,19 @@
 import ctypes
 from ctypes import POINTER, Structure, c_int32, c_size_t, c_uint64, c_void_p, c_float
 from libinfiniop import (
-    infiniopHandle_t, infiniopTensorDescriptor_t, open_lib, to_tensor, get_test_devices, 
-    check_error, rearrange_if_needed, create_workspace, test_operator, get_args, 
-    debug, get_tolerance, profile_operation,
+    infiniopHandle_t,
+    infiniopTensorDescriptor_t,
+    open_lib,
+    to_tensor,
+    get_test_devices,
+    check_error,
+    rearrange_if_needed,
+    create_workspace,
+    test_operator,
+    get_args,
+    debug,
+    get_tolerance,
+    profile_operation,
 )
 
 # ==============================================================================
@@ -21,8 +31,8 @@
     (1.0, 0.0, (1, 2048), (2048, 2048), (1, 2048), (4096, 1), (4096, 1), (4096, 1)),
     (1.0, 1.0, (6, 2048), (2048, 2560), (6, 2560), (2048, 1), (1, 2048), (2560, 1)),
     (1.0, 1.0, (6, 2048), (2048, 2560), (6, 2560), (2048, 1), (1, 2048), (2560, 1)),
-    (1.0/8.0, 0.0, (4, 8*6, 64), (4, 64, 6), (4, 8*6, 6), None, None, None),
-    (1.0/8.0, 0.0, (4, 8*6, 64), (4, 64, 6), (4, 8*6, 6), None, None, None),
+    (1.0 / 8.0, 0.0, (4, 8 * 6, 64), (4, 64, 6), (4, 8 * 6, 6), None, None, None),
+    (1.0 / 8.0, 0.0, (4, 8 * 6, 64), (4, 64, 6), (4, 8 * 6, 6), None, None, None),
 ]
 
 # Data types used for testing
@@ -30,8 +40,8 @@
 
 # Tolerance map for different data types
 _TOLERANCE_MAP = {
-    torch.float16: {'atol': 0, 'rtol': 1e-2},
-    torch.float32: {'atol': 0, 'rtol': 1e-3},
+    torch.float16: {"atol": 0, "rtol": 1e-2},
+    torch.float32: {"atol": 0, "rtol": 1e-3},
 }
 
 DEBUG = False
@@ -39,6 +49,7 @@
 NUM_PRERUN = 10
 NUM_ITERATIONS = 1000
 
+
 # ==============================================================================
 #  Definitions
 # ==============================================================================
@@ -48,6 +59,7 @@ class MatmulDescriptor(Structure):
 
 infiniopMatmulDescriptor_t = POINTER(MatmulDescriptor)
 
+
 # PyTorch implementation for matrix multiplication
 def matmul(_c, beta, _a, _b, alpha):
     a, b, c = _a.clone(), _b.clone(), _c.clone()
@@ -55,6 +67,7 @@ def matmul(_c, beta, _a, _b, alpha):
     fp32_result = torch.matmul(a.to(torch.float32), b.to(torch.float32))
     return alpha * fp32_result.to(result_dtype) + beta * c
 
+
 # The argument list should be (lib, handle, torch_device, <param list>, dtype)
 # The <param list> should keep the same order as the one specified in _TEST_CASES
 def test(
@@ -85,7 +98,10 @@ def test(
     # Compute the PyTorch reference result
     ans = matmul(c, beta, a, b, alpha)
 
-    a, b, c = [rearrange_if_needed(tensor, stride) for tensor, stride in zip([a, b, c], [a_stride, b_stride, c_stride])]
+    a, b, c = [
+        rearrange_if_needed(tensor, stride)
+        for tensor, stride in zip([a, b, c], [a_stride, b_stride, c_stride])
+    ]
     a_tensor, b_tensor, c_tensor = [to_tensor(tensor, lib) for tensor in [a, b, c]]
 
     descriptor = infiniopMatmulDescriptor_t()
@@ -95,7 +111,7 @@ def test(
             ctypes.byref(descriptor),
             c_tensor.descriptor,
             a_tensor.descriptor,
-            b_tensor.descriptor
+            b_tensor.descriptor,
         )
     )
 
@@ -105,22 +121,27 @@ def test(
 
     # Get workspace size and create workspace
     workspace_size = c_uint64(0)
-    check_error(lib.infiniopGetMatmulWorkspaceSize(descriptor, ctypes.byref(workspace_size)))
+    check_error(
+        lib.infiniopGetMatmulWorkspaceSize(descriptor, ctypes.byref(workspace_size))
+    )
     workspace = create_workspace(workspace_size.value, a.device)
 
     # Execute infiniop matmul operator
     def lib_matmul():
-        check_error(lib.infiniopMatmul(
-            descriptor, 
-            workspace.data_ptr() if workspace is not None else None,
-            workspace_size.value,
-            c_tensor.data,
-            a_tensor.data,
-            b_tensor.data,
-            alpha,
-            beta,
-            None,
-        ))
+        check_error(
+            lib.infiniopMatmul(
+                descriptor,
+                workspace.data_ptr() if workspace is not None else None,
+                workspace_size.value,
+                c_tensor.data,
+                a_tensor.data,
+                b_tensor.data,
+                alpha,
+                beta,
+                None,
+            )
+        )
+
     lib_matmul()
 
     # Validate results
@@ -131,9 +152,10 @@ def lib_matmul():
 
     # Profiling workflow
     if PROFILE:
+        # fmt: off
         profile_operation("PyTorch", lambda: matmul(c, beta, a, b, alpha), torch_device, NUM_PRERUN, NUM_ITERATIONS)
         profile_operation("    lib", lambda: lib_matmul(), torch_device, NUM_PRERUN, NUM_ITERATIONS)
-
+        # fmt: on
     check_error(lib.infiniopDestroyMatmulDescriptor(descriptor))
 
 
@@ -150,7 +172,7 @@ def lib_matmul():
         POINTER(infiniopMatmulDescriptor_t),
         infiniopTensorDescriptor_t,
         infiniopTensorDescriptor_t,
-        infiniopTensorDescriptor_t
+        infiniopTensorDescriptor_t,
     ]
 
     lib.infiniopGetMatmulWorkspaceSize.restype = c_int32
diff --git a/test/infiniop/max_pool.py b/test/infiniop/max_pool.py
index ffc0bb192..cfeca929b 100644
--- a/test/infiniop/max_pool.py
+++ b/test/infiniop/max_pool.py
@@ -35,7 +35,7 @@ class MaxPoolDescriptor(Structure):
 infiniopMaxPoolDescriptor_t = POINTER(MaxPoolDescriptor)
 
 
-def pool(x, k, padding, stride, dilation = 1):
+def pool(x, k, padding, stride, dilation=1):
     pooling_layers = {
         1: torch.nn.MaxPool1d,
         2: torch.nn.MaxPool2d,
@@ -66,18 +66,20 @@ def inferShape(x_shape, kernel_shape, padding, strides):
 
     return x_shape[:2] + tuple(output_shape)
 
+
 # convert a python tuple to a ctype void pointer
 def tuple_to_void_p(py_tuple: Tuple):
     array = ctypes.c_int64 * len(py_tuple)
     data_array = array(*py_tuple)
     return ctypes.cast(data_array, ctypes.c_void_p)
 
+
 def test(
     lib,
     handle,
     torch_device,
-    x_shape, 
-    k_shape, 
+    x_shape,
+    k_shape,
     padding,
     strides,
     tensor_dtype=torch.float16,
@@ -87,7 +89,9 @@ def test(
     )
 
     x = torch.rand(x_shape, dtype=tensor_dtype).to(torch_device)
-    y = torch.rand(inferShape(x_shape, k_shape, padding, strides), dtype=tensor_dtype).to(torch_device)
+    y = torch.rand(
+        inferShape(x_shape, k_shape, padding, strides), dtype=tensor_dtype
+    ).to(torch_device)
 
     for i in range(NUM_PRERUN if PROFILE else 1):
         ans = pool(x, k_shape, padding, strides)
@@ -123,7 +127,9 @@ def test(
     check_error(
         lib.infiniopGetMaxPoolWorkspaceSize(descriptor, ctypes.byref(workspaceSize))
     )
-    workspace = torch.zeros(int(workspaceSize.value), dtype=torch.uint8).to(torch_device)
+    workspace = torch.zeros(int(workspaceSize.value), dtype=torch.uint8).to(
+        torch_device
+    )
     workspace_ptr = ctypes.cast(workspace.data_ptr(), ctypes.POINTER(ctypes.c_uint8))
 
     for i in range(NUM_PRERUN if PROFILE else 1):
@@ -161,8 +167,10 @@ def test_cpu(lib, test_cases):
     device = DeviceEnum.DEVICE_CPU
     handle = create_handle(lib, device)
     for x_shape, kernel_shape, padding, strides in test_cases:
+        # fmt: off
         test(lib, handle, "cpu", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float16)
         test(lib, handle, "cpu", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float32)
+        # fmt: on
     destroy_handle(lib, handle)
 
 
@@ -170,8 +178,10 @@ def test_cuda(lib, test_cases):
     device = DeviceEnum.DEVICE_CUDA
     handle = create_handle(lib, device)
     for x_shape, kernel_shape, padding, strides in test_cases:
+        # fmt: off
         test(lib, handle, "cuda", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float16)
         test(lib, handle, "cuda", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float32)
+        # fmt: on
     destroy_handle(lib, handle)
 
 
@@ -181,8 +191,10 @@ def test_bang(lib, test_cases):
     device = DeviceEnum.DEVICE_BANG
     handle = create_handle(lib, device)
     for x_shape, kernel_shape, padding, strides in test_cases:
+        # fmt: off
         test(lib, handle, "mlu", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float16)
         test(lib, handle, "mlu", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float32)
+        # fmt: on
     destroy_handle(lib, handle)
 
 
diff --git a/test/infiniop/random_sample.py b/test/infiniop/random_sample.py
index 98a8dcebe..a5eb143ab 100644
--- a/test/infiniop/random_sample.py
+++ b/test/infiniop/random_sample.py
@@ -30,13 +30,13 @@ class RandomSampleDescriptor(Structure):
 
 
 def random_sample(data, random_val, topp, topk, voc, temperature, torch_device):
-    indices = torch.zeros([topk], dtype = torch.int64)
+    indices = torch.zeros([topk], dtype=torch.int64)
     dataNp = data.clone().detach()
     sorted_indices = torch.arange(voc)
-    
+
     for i in range(topk):
         for j in range(i + 1, voc):
-            if(dataNp[i] < dataNp[j]):
+            if dataNp[i] < dataNp[j]:
                 tmp = dataNp[i].clone().detach()
                 dataNp[i] = dataNp[j].clone().detach()
                 dataNp[j] = tmp
@@ -44,48 +44,60 @@ def random_sample(data, random_val, topp, topk, voc, temperature, torch_device):
                 tmpInd = sorted_indices[i].clone().detach()
                 sorted_indices[i] = sorted_indices[j].clone().detach()
                 sorted_indices[j] = tmpInd
-                
-    #sorted_indices = torch.argsort(dataNp, descending=True)
-    indices = sorted_indices[:topk] 
-    
+
+    # sorted_indices = torch.argsort(dataNp, descending=True)
+    indices = sorted_indices[:topk]
+
     dataNp = dataNp[sorted_indices]
-    
+
     globalM = dataNp[0]
     dataNp = (dataNp - globalM) / temperature
-    dataNp = torch.softmax(dataNp.float(), dim = 0)
+    dataNp = torch.softmax(dataNp.float(), dim=0)
     sum_s = 0
     for end in range(topk):
         sum_s += dataNp[end]
-        if(sum_s >= topp):
+        if sum_s >= topp:
             break
-    if(end < topk - 1):
+    if end < topk - 1:
         end += 1
     else:
         end = topk
-    
+
     sum_s = 0
     for i in range(end):
         sum_s += dataNp[i]
     random_val *= sum_s
-    
+
     sum_s = 0
     for i in range(end):
         sum_s += dataNp[i]
-        if(random_val < sum_s):
+        if random_val < sum_s:
             return indices[i]
 
+
 def random_sample_0(data):
     return torch.argmax(data)
 
-def test(lib, handle, torch_device, voc, random_val, topp, topk, temperature, x_dtype=torch.float16):
-    print(
-        f"Testing RandomSample on {torch_device} with voc:{voc} dtype:{x_dtype}"
-    )
+
+def test(
+    lib,
+    handle,
+    torch_device,
+    voc,
+    random_val,
+    topp,
+    topk,
+    temperature,
+    x_dtype=torch.float16,
+):
+    print(f"Testing RandomSample on {torch_device} with voc:{voc} dtype:{x_dtype}")
     data = torch.arange(voc).float() * 0.0001
     _perm = torch.randperm(voc)
     data = data[_perm].to(x_dtype).to(torch_device)
-    if(topp > 0 and topk > 1):
-        ans = random_sample(data.to("cpu"), random_val, topp, topk, voc, temperature, "cpu")
+    if topp > 0 and topk > 1:
+        ans = random_sample(
+            data.to("cpu"), random_val, topp, topk, voc, temperature, "cpu"
+        )
     else:
         ans = random_sample_0(data)
     indices = torch.zeros([1], dtype=torch.int64).to(torch_device)
@@ -96,7 +108,10 @@ def test(lib, handle, torch_device, voc, random_val, topp, topk, temperature, x_
     descriptor = infiniopRandomSampleDescriptor_t()
     check_error(
         lib.infiniopCreateRandomSampleDescriptor(
-            handle, ctypes.byref(descriptor), indices_tensor.descriptor, x_tensor.descriptor
+            handle,
+            ctypes.byref(descriptor),
+            indices_tensor.descriptor,
+            x_tensor.descriptor,
         )
     )
 
@@ -110,7 +125,7 @@ def test(lib, handle, torch_device, voc, random_val, topp, topk, temperature, x_
             descriptor, ctypes.byref(workspace_size)
         )
     )
-    workspace = create_workspace(workspace_size.value, torch_device) 
+    workspace = create_workspace(workspace_size.value, torch_device)
     check_error(
         lib.infiniopRandomSample(
             descriptor,
@@ -131,10 +146,11 @@ def test(lib, handle, torch_device, voc, random_val, topp, topk, temperature, x_
     assert indices[0].type(ans.dtype) == ans or data[ans] == data[indices[0]]
     check_error(lib.infiniopDestroyRandomSampleDescriptor(descriptor))
 
+
 def test_cpu(lib, test_cases):
     device = DeviceEnum.DEVICE_CPU
     handle = create_handle(lib, device)
-    for (voc, random_val, topp, topk, temperature) in test_cases:
+    for voc, random_val, topp, topk, temperature in test_cases:
         test(lib, handle, "cpu", voc, random_val, topp, topk, temperature)
     destroy_handle(lib, handle)
 
@@ -142,7 +158,7 @@ def test_cpu(lib, test_cases):
 def test_cuda(lib, test_cases):
     device = DeviceEnum.DEVICE_CUDA
     handle = create_handle(lib, device)
-    for (voc, random_val, topp, topk, temperature) in test_cases:
+    for voc, random_val, topp, topk, temperature in test_cases:
         test(lib, handle, "cuda", voc, random_val, topp, topk, temperature)
     destroy_handle(lib, handle)
 
@@ -152,16 +168,17 @@ def test_bang(lib, test_cases):
 
     device = DeviceEnum.DEVICE_BANG
     handle = create_handle(lib, device)
-    for (voc, random_val, topp, topk, temperature) in test_cases:
+    for voc, random_val, topp, topk, temperature in test_cases:
         test(lib, handle, "mlu", voc, random_val, topp, topk, temperature)
     destroy_handle(lib, handle)
 
 
 def test_ascend(lib, test_cases):
     import torch_npu
+
     device = DeviceEnum.DEVICE_ASCEND
     handle = create_handle(lib, device)
-    for (voc, random_val, topp, topk, temperature) in test_cases:
+    for voc, random_val, topp, topk, temperature in test_cases:
         test(lib, handle, "npu", voc, random_val, topp, topk, temperature)
     destroy_handle(lib, handle)
 
@@ -180,7 +197,7 @@ def test_ascend(lib, test_cases):
         (32000, 0.08, 1.0, 25, 1.0),
         # (119696, 0.01, 1.0, 100, 1.0),
     ]
-    
+
     args = get_args()
     lib = open_lib()
     lib.infiniopCreateRandomSampleDescriptor.restype = c_int32
diff --git a/test/infiniop/rearrange.py b/test/infiniop/rearrange.py
index e9cc81b90..f9d5306c5 100644
--- a/test/infiniop/rearrange.py
+++ b/test/infiniop/rearrange.py
@@ -61,9 +61,7 @@ def test(
     x_tensor.descriptor.contents.invalidate()
     y_tensor.descriptor.contents.invalidate()
 
-    check_error(
-        lib.infiniopRearrange(descriptor, y_tensor.data, x_tensor.data, None)
-    )
+    check_error(lib.infiniopRearrange(descriptor, y_tensor.data, x_tensor.data, None))
     assert torch.allclose(x, y, atol=0, rtol=1e-3)
     check_error(lib.infiniopDestroyRearrangeDescriptor(descriptor))
 
@@ -87,8 +85,10 @@ def test_cuda(lib, test_cases):
         test(lib, handle, "cuda", x_shape, x_stride, y_shape, y_stride)
     destroy_handle(lib, handle)
 
+
 def test_bang(lib, test_cases):
     import torch_mlu
+
     device = DeviceEnum.DEVICE_BANG
     handle = create_handle(lib, device)
     for test_case in test_cases:
@@ -97,6 +97,7 @@ def test_bang(lib, test_cases):
         test(lib, handle, "mlu", x_shape, x_stride, y_shape, y_stride)
     destroy_handle(lib, handle)
 
+
 def test_ascend(lib, test_cases):
     import torch_npu
 
@@ -106,7 +107,8 @@ def test_ascend(lib, test_cases):
         x_shape, x_stride = test_case[0]
         y_shape, y_stride = test_case[1]
         test(lib, handle, "npu", x_shape, x_stride, y_shape, y_stride)
-    destroy_handle(lib, handle) 
+    destroy_handle(lib, handle)
+
 
 if __name__ == "__main__":
     args = get_args()
@@ -119,7 +121,7 @@ def test_ascend(lib, test_cases):
         (((32, 1, 64), (64, 2560, 1)), ((32, 1, 64), (64, 64, 1))),
         (((4, 1, 64), (64, 2560, 1)), ((4, 1, 64), (64, 11264, 1))),
         (((64,), (1,)), ((64,), (1,))),
-        ]
+    ]
     lib = open_lib()
     lib.infiniopCreateRearrangeDescriptor.restype = c_int32
     lib.infiniopCreateRearrangeDescriptor.argtypes = [
diff --git a/test/infiniop/relu.py b/test/infiniop/relu.py
index b7f766273..8599c46ca 100644
--- a/test/infiniop/relu.py
+++ b/test/infiniop/relu.py
@@ -52,7 +52,7 @@ def test(
     lib,
     handle,
     torch_device,
-    tensor_shape, 
+    tensor_shape,
     tensor_dtype=torch.float16,
     inplace=Inplace.OUT_OF_PLACE,
 ):
@@ -61,7 +61,11 @@ def test(
     )
 
     x = torch.rand(tensor_shape, dtype=tensor_dtype).to(torch_device) * 2 - 1
-    y = torch.rand(tensor_shape, dtype=tensor_dtype).to(torch_device) if inplace == Inplace.OUT_OF_PLACE else x
+    y = (
+        torch.rand(tensor_shape, dtype=tensor_dtype).to(torch_device)
+        if inplace == Inplace.OUT_OF_PLACE
+        else x
+    )
 
     for i in range(NUM_PRERUN if PROFILE else 1):
         ans = relu(x)
@@ -108,17 +112,22 @@ def test_cpu(lib, test_cases):
     device = DeviceEnum.DEVICE_CPU
     handle = create_handle(lib, device)
     for tensor_shape, inplace in test_cases:
+        # fmt: off
         test(lib, handle, "cpu", tensor_shape, tensor_dtype=torch.float16, inplace=inplace)
         test(lib, handle, "cpu", tensor_shape, tensor_dtype=torch.float32, inplace=inplace)
+        # fmt: on
     destroy_handle(lib, handle)
 
 
 def test_cuda(lib, test_cases):
+
     device = DeviceEnum.DEVICE_CUDA
     handle = create_handle(lib, device)
     for tensor_shape, inplace in test_cases:
+        # fmt: off
         test(lib, handle, "cuda", tensor_shape, tensor_dtype=torch.float16, inplace=inplace)
         test(lib, handle, "cuda", tensor_shape, tensor_dtype=torch.float32, inplace=inplace)
+        # fmt: on
     destroy_handle(lib, handle)
 
 
@@ -128,8 +137,10 @@ def test_bang(lib, test_cases):
     device = DeviceEnum.DEVICE_BANG
     handle = create_handle(lib, device)
     for tensor_shape, inplace in test_cases:
+        # fmt: off
         test(lib, handle, "mlu", tensor_shape, tensor_dtype=torch.float16, inplace=inplace)
         test(lib, handle, "mlu", tensor_shape, tensor_dtype=torch.float32, inplace=inplace)
+        # fmt: on
     destroy_handle(lib, handle)
 
 
diff --git a/test/infiniop/rms_norm.py b/test/infiniop/rms_norm.py
index 13cf1ccf9..21e27348e 100644
--- a/test/infiniop/rms_norm.py
+++ b/test/infiniop/rms_norm.py
@@ -20,12 +20,14 @@
 from operatorspy.tests.test_utils import get_args
 import torch
 
+
 class RMSNormDescriptor(Structure):
     _fields_ = [("device", c_int32)]
 
 
 infiniopRMSNormDescriptor_t = POINTER(RMSNormDescriptor)
 
+
 def rms_norm(x, w, eps):
     input_dtype = x.dtype
     hidden_states = x.to(torch.float32)
@@ -34,9 +36,20 @@ def rms_norm(x, w, eps):
     return w * hidden_states.to(input_dtype)
 
 
-def test(lib, handle, torch_device, y_shape, x_shape, w_shape, dtype=torch.float16, w_dtype=torch.float16):
-    print(f"Testing RMS_Norm on {torch_device} with y_shape:{y_shape} x_shape:{x_shape} w_shape:{w_shape}"
-        f" dtype:{dtype} w_dtype:{w_dtype}")
+def test(
+    lib,
+    handle,
+    torch_device,
+    y_shape,
+    x_shape,
+    w_shape,
+    dtype=torch.float16,
+    w_dtype=torch.float16,
+):
+    print(
+        f"Testing RMS_Norm on {torch_device} with y_shape:{y_shape} x_shape:{x_shape} w_shape:{w_shape}"
+        f" dtype:{dtype} w_dtype:{w_dtype}"
+    )
 
     y = torch.zeros(y_shape, dtype=dtype).to(torch_device)
     x = torch.rand(x_shape, dtype=dtype).to(torch_device)
@@ -50,12 +63,16 @@ def test(lib, handle, torch_device, y_shape, x_shape, w_shape, dtype=torch.float
     w_tensor = to_tensor(w, lib)
 
     descriptor = infiniopRMSNormDescriptor_t()
-    w_dataType = 0 if w_dtype==torch.float16 else 1
+    w_dataType = 0 if w_dtype == torch.float16 else 1
 
     check_error(
         lib.infiniopCreateRMSNormDescriptor(
-            handle, ctypes.byref(descriptor), y_tensor.descriptor, x_tensor.descriptor,
-            w_tensor.descriptor, eps
+            handle,
+            ctypes.byref(descriptor),
+            y_tensor.descriptor,
+            x_tensor.descriptor,
+            w_tensor.descriptor,
+            eps,
         )
     )
 
@@ -66,9 +83,7 @@ def test(lib, handle, torch_device, y_shape, x_shape, w_shape, dtype=torch.float
 
     workspace_size = c_uint64(0)
     check_error(
-        lib.infiniopGetRMSNormWorkspaceSize(
-            descriptor, ctypes.byref(workspace_size)
-        )
+        lib.infiniopGetRMSNormWorkspaceSize(descriptor, ctypes.byref(workspace_size))
     )
     workspace = create_workspace(workspace_size.value, y.device)
     check_error(
@@ -86,37 +101,44 @@ def test(lib, handle, torch_device, y_shape, x_shape, w_shape, dtype=torch.float
     assert torch.allclose(y.to(dtype), ans.to(dtype), atol=1e-3, rtol=1e-3)
     check_error(lib.infiniopDestroyRMSNormDescriptor(descriptor))
 
+
 def test_cpu(lib, test_cases):
     device = DeviceEnum.DEVICE_CPU
     handle = create_handle(lib, device)
-    for (y_shape, x_shape, w_shape, dtype, w_dtype) in test_cases:
+    for y_shape, x_shape, w_shape, dtype, w_dtype in test_cases:
         test(lib, handle, "cpu", y_shape, x_shape, w_shape, dtype, w_dtype)
     destroy_handle(lib, handle)
 
+
 def test_cuda(lib, test_cases):
     device = DeviceEnum.DEVICE_CUDA
     handle = create_handle(lib, device)
-    for (y_shape, x_shape, w_shape, dtype, w_dtype) in test_cases:
+    for y_shape, x_shape, w_shape, dtype, w_dtype in test_cases:
         test(lib, handle, "cuda", y_shape, x_shape, w_shape, dtype, w_dtype)
     destroy_handle(lib, handle)
 
+
 def test_bang(lib, test_cases):
     import torch_mlu
+
     device = DeviceEnum.DEVICE_BANG
     handle = create_handle(lib, device)
-    for (y_shape, x_shape, w_shape, dtype, w_dtype) in test_cases:
+    for y_shape, x_shape, w_shape, dtype, w_dtype in test_cases:
         test(lib, handle, "mlu", y_shape, x_shape, w_shape, dtype, w_dtype)
     destroy_handle(lib, handle)
 
+
 def test_ascend(lib, test_cases):
     import torch_npu
+
     device = DeviceEnum.DEVICE_ASCEND
     handle = create_handle(lib, device)
-    for (y_shape, x_shape, w_shape, dtype, w_dtype) in test_cases:
+    for y_shape, x_shape, w_shape, dtype, w_dtype in test_cases:
         test(lib, handle, "npu", y_shape, x_shape, w_shape, dtype, w_dtype)
 
     destroy_handle(lib, handle)
 
+
 if __name__ == "__main__":
     test_cases = [
         # y_shape, x_shape, w_shape, dtype, w_dtype
diff --git a/test/infiniop/rotary_embedding.py b/test/infiniop/rotary_embedding.py
index 081d2f915..e4af9a57a 100644
--- a/test/infiniop/rotary_embedding.py
+++ b/test/infiniop/rotary_embedding.py
@@ -45,12 +45,13 @@ def rotary_embedding(t, pos, theta, torch_device):
     )
     freqs = torch.outer(pos, freqs)
     freqs_cis = torch.polar(torch.ones_like(freqs), freqs)
-    
+
     t_ = torch.view_as_complex(t.reshape(*t.shape[:-1], -1, 2))
     freqs_cis = reshape_for_broadcast(freqs_cis, t_)
     t_out = torch.view_as_real(t_ * freqs_cis).flatten(2).to(t.dtype)
     return t_out
 
+
 def sin_cos_table(max_seq_len, dim, torch_device, theta):
     pos = torch.arange(
         0, max_seq_len, dtype=torch.float32, device=torch.device(torch_device)
@@ -73,12 +74,12 @@ def test(lib, handle, torch_device, shape, strides=None, dtype=torch.float16):
     if strides is not None:
         t = rearrange_tensor(t, strides)
     posTmp = torch.arange(0, t.shape[0])
-    pos = torch.zeros(2 * posTmp.shape[0], dtype = torch.int32)
+    pos = torch.zeros(2 * posTmp.shape[0], dtype=torch.int32)
     for i in range(posTmp.shape[0]):
         pos[2 * i] = posTmp[i]
         pos[2 * i + 1] = 0
     theta = 1e4
-    if torch_device == 'mlu' or torch_device == 'npu':
+    if torch_device == "mlu" or torch_device == "npu":
         ans = rotary_embedding(t, posTmp, theta, "cpu").to(torch_device)
         pos = pos.to(torch_device)
         t = t.to(torch_device)
@@ -97,7 +98,7 @@ def test(lib, handle, torch_device, shape, strides=None, dtype=torch.float16):
     cos_table_tensor = to_tensor(cos_table, lib)
 
     if torch_device == "npu":
-        torch.npu.synchronize() 
+        torch.npu.synchronize()
 
     check_error(
         lib.infiniopCreateRoPEDescriptor(
@@ -156,6 +157,7 @@ def test_cuda(lib, test_cases):
 
 def test_bang(lib, test_cases):
     import torch_mlu
+
     device = DeviceEnum.DEVICE_BANG
     handle = create_handle(lib, device)
     for shape, strides, dtype in test_cases:
@@ -163,7 +165,7 @@ def test_bang(lib, test_cases):
     destroy_handle(lib, handle)
 
 
-def test_ascend(lib, test_cases) :
+def test_ascend(lib, test_cases):
     import torch_npu
 
     device = DeviceEnum.DEVICE_ASCEND
@@ -172,6 +174,7 @@ def test_ascend(lib, test_cases) :
         test(lib, handle, "npu", shape, strides, dtype)
     destroy_handle(lib, handle)
 
+
 if __name__ == "__main__":
     test_cases = [
         ((1, 32, 128), None, torch.float16),
@@ -180,7 +183,6 @@ def test_ascend(lib, test_cases) :
         # 接口 GatherMask 的内部实现相关，目前 48 64 128 都可以支持
         ((4, 1, 32), None, torch.float16),
         ((1, 32, 128), None, torch.float16),
-        
         ((3, 32, 128), (8000, 200, 1), torch.float16),
     ]
     args = get_args()
diff --git a/test/infiniop/swiglu.py b/test/infiniop/swiglu.py
index 7fb447a1a..67b3c2b85 100644
--- a/test/infiniop/swiglu.py
+++ b/test/infiniop/swiglu.py
@@ -29,9 +29,10 @@ class SwiGLUDescriptor(Structure):
 
 
 def swiglu(a, b):
-    
+
     return a * b / (1 + torch.exp(-b.float()).to(b.dtype))
 
+
 def test_out_of_place(
     lib,
     handle,
@@ -223,6 +224,7 @@ def test_cuda(lib, test_cases):
 
 def test_bang(lib, test_cases):
     import torch_mlu
+
     device = DeviceEnum.DEVICE_BANG
     handle = create_handle(lib, device)
 
@@ -238,17 +240,30 @@ def test_bang(lib, test_cases):
 
 def test_ascend(lib, test_cases):
     import torch_npu
+
     device = DeviceEnum.DEVICE_ASCEND
     handle = create_handle(lib, device)
 
     for shape, a_stride, b_stride, c_stride, dtype in test_cases:
         test_out_of_place(
-            lib, handle, "npu", shape, a_stride, b_stride, c_stride, dtype, torch.npu.synchronize
+            lib,
+            handle,
+            "npu",
+            shape,
+            a_stride,
+            b_stride,
+            c_stride,
+            dtype,
+            torch.npu.synchronize,
+        )
+        test_in_place1(
+            lib, handle, "npu", shape, a_stride, b_stride, dtype, torch.npu.synchronize
+        )
+        test_in_place2(
+            lib, handle, "npu", shape, a_stride, b_stride, dtype, torch.npu.synchronize
         )
-        test_in_place1(lib, handle, "npu", shape, a_stride, b_stride, dtype, torch.npu.synchronize)
-        test_in_place2(lib, handle, "npu", shape, a_stride, b_stride, dtype, torch.npu.synchronize)
 
-    destroy_handle(lib, handle) 
+    destroy_handle(lib, handle)
 
 
 if __name__ == "__main__":

From e5ed9fa172d78ac194bb3edc7105b8b7801a16dd Mon Sep 17 00:00:00 2001
From: YdrMaster <ydrml@hotmail.com>
Date: Mon, 17 Feb 2025 18:52:59 +0800
Subject: [PATCH 5/5] =?UTF-8?q?issue/52:=20=E6=B7=BB=E5=8A=A0=20GitHub=20A?=
 =?UTF-8?q?ctions=20workflow?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: YdrMaster <ydrml@hotmail.com>
---
 .github/workflows/build.yml | 37 +++++++++++++++++++++++++++++++++++++
 1 file changed, 37 insertions(+)
 create mode 100644 .github/workflows/build.yml

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
new file mode 100644
index 000000000..bc65cd0ac
--- /dev/null
+++ b/.github/workflows/build.yml
@@ -0,0 +1,37 @@
+name: Build and test
+on:
+  pull_request:
+  push:
+    paths-ignore:
+      - '**.md'
+      - 'LICENSE'
+
+jobs:
+  build:
+    name: Build
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        type: [debug, release]
+    steps:
+
+    - name: checkout code
+      uses: actions/checkout@v4
+
+    - name: install black
+      run: pip install black
+
+    - name: check format
+      run: python3 scripts/format.py --path src --check
+
+    - name: install xmake
+      uses: xmake-io/github-action-setup-xmake@v1
+      with:
+        xmake-version: latest
+
+    - name: configure xmake
+      run: xmake f -cv
+
+    - name: build with xmake
+      run: xmake build && xmake install