-
Notifications
You must be signed in to change notification settings - Fork 0
feat(ops): add RmsNorm with Iluvatar, NVIDIA, CPU backends and fp16/bf16 support
#6
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: feat/dev-infra
Are you sure you want to change the base?
Changes from all commits
377ec59
10187f4
899d46c
7f7ea45
e650290
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -41,18 +41,33 @@ endif() | |
|
|
||
| include_directories(${CMAKE_CURRENT_SOURCE_DIR}/src) | ||
|
|
||
| # NVIDIA and Iluvatar are parallel backends; only one GPU backend at a time. | ||
| if(WITH_NVIDIA AND WITH_ILUVATAR) | ||
| message(FATAL_ERROR "WITH_NVIDIA and WITH_ILUVATAR cannot both be ON. Build one GPU backend at a time.") | ||
| endif() | ||
|
|
||
| if(WITH_NVIDIA) | ||
| add_compile_definitions(WITH_NVIDIA=1) | ||
| enable_language(CUDA) | ||
| find_package(CUDAToolkit REQUIRED) | ||
| endif() | ||
|
|
||
| # Iluvatar: CUDA-compatible device, uses clang++ with -x ivcore (not nvcc). | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 使用 Markdown 语法："`WITH_NVIDIA` and `WITH_ILUVATAR` cannot both be `ON`. Build one GPU backend at a time."。 |
||
| # Reference: InfiniCore xmake/iluvatar.lua | ||
| if(WITH_ILUVATAR) | ||
| add_compile_definitions(WITH_ILUVATAR=1) | ||
| if(NOT WITH_NVIDIA) | ||
| enable_language(CUDA) | ||
| find_package(CUDAToolkit REQUIRED) | ||
| set(ILUVATAR_ARCH "ivcore20" CACHE STRING "Iluvatar GPU architecture") | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 天数上我开发的不太多,但是我记得之前开发 |
||
| find_program(CLANGXX NAMES clang++) | ||
| if(CLANGXX) | ||
| set(CMAKE_CUDA_COMPILER "${CLANGXX}" CACHE STRING "Iluvatar CUDA compiler (clang++)") | ||
| else() | ||
| set(CMAKE_CUDA_COMPILER "clang++" CACHE STRING "Iluvatar CUDA compiler (clang++)") | ||
| endif() | ||
| set(CMAKE_CUDA_FLAGS "-x ivcore -std=c++17 --cuda-gpu-arch=${ILUVATAR_ARCH} -fPIC -Wno-error=unused-variable -Wno-error=unused-private-field -Wno-unused-variable" CACHE STRING "Iluvatar CUDA flags") | ||
| set(CMAKE_CUDA_SEPARABLE_COMPILATION OFF CACHE BOOL "Disable RDC for Iluvatar") | ||
| message(STATUS "Iluvatar: CUDA compiler ${CMAKE_CUDA_COMPILER}, arch ${ILUVATAR_ARCH}") | ||
| enable_language(CUDA) | ||
| find_package(CUDAToolkit REQUIRED) | ||
| endif() | ||
|
|
||
| if(WITH_METAX) | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -72,27 +72,25 @@ | |
| """ | ||
|
|
||
|
|
||
| class _OperatorExtractor: | ||
| def __call__(self, op_name): | ||
| def _get_system_include_flags(): | ||
| system_include_flags = [] | ||
|
|
||
| for line in subprocess.getoutput( | ||
| "clang++ -E -x c++ -v /dev/null" | ||
| ).splitlines(): | ||
| if not line.startswith(" "): | ||
| continue | ||
|
|
||
| system_include_flags.append("-isystem") | ||
| system_include_flags.append(line.strip()) | ||
| def _get_system_include_flags(): | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 这个 helper 还是先放回原位吧,一个原因是它应该只是在 |
||
| system_include_flags = [] | ||
| for line in subprocess.getoutput( | ||
| "clang++ -E -x c++ -v /dev/null" | ||
| ).splitlines(): | ||
| if not line.startswith(" "): | ||
| continue | ||
| system_include_flags.append("-isystem") | ||
| system_include_flags.append(line.strip()) | ||
| return system_include_flags | ||
|
|
||
| return system_include_flags | ||
|
|
||
| class _OperatorExtractor: | ||
| def __call__(self, op_name, base_stem=None): | ||
| system_include_flags = _get_system_include_flags() | ||
|
|
||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 这些原有的空行能不动就先不动,最小化更改,还原一下,下面应当也有几处只减少了空行的地方,同理处理即可。 |
||
| index = clang.cindex.Index.create() | ||
| args = ("-std=c++17", "-x", "c++", "-I", "src") + tuple(system_include_flags) | ||
| translation_unit = index.parse(f"src/base/{op_name.lower()}.h", args=args) | ||
| header = f"src/base/{(base_stem or op_name.lower())}.h" | ||
| translation_unit = index.parse(header, args=args) | ||
|
|
||
| nodes = tuple(type(self)._find(translation_unit.cursor, op_name)) | ||
|
|
||
|
|
@@ -105,7 +103,8 @@ def _get_system_include_flags(): | |
| elif node.kind == CursorKind.CXX_METHOD and node.spelling == "operator()": | ||
| calls.append(node) | ||
|
|
||
| return _Operator(op_name, constructors, calls) | ||
| header_name = base_stem if base_stem is not None else op_name.lower() | ||
| return _Operator(op_name, constructors, calls, header_name=header_name) | ||
|
|
||
| @staticmethod | ||
| def _find(node, op_name): | ||
|
|
@@ -117,12 +116,34 @@ def _find(node, op_name): | |
|
|
||
|
|
||
| class _Operator: | ||
| def __init__(self, name, constructors, calls): | ||
| def __init__(self, name, constructors, calls, header_name=None): | ||
| self.name = name | ||
|
|
||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 同上。 |
||
| self.constructors = constructors | ||
|
|
||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 同上。 |
||
| self.calls = calls | ||
| self.header_name = header_name if header_name is not None else name.lower() | ||
|
|
||
|
|
||
| def _make_mock_node(params): | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 关于 |
||
| """Create a mock node with get_arguments() for manual operator specs.""" | ||
|
|
||
| class _Type: | ||
| def __init__(self, spelling): | ||
| self.spelling = spelling | ||
|
|
||
| class _Arg: | ||
| def __init__(self, type_spelling, name): | ||
| self.type = _Type(type_spelling) | ||
| self.spelling = name | ||
|
|
||
| class _MockNode: | ||
| def get_arguments(self): | ||
| return [_Arg(typ, name) for typ, name in params] | ||
|
|
||
| return _MockNode() | ||
|
|
||
|
|
||
| # Operators that fail libclang parse; provide manual spec for wrapper generation. | ||
| _MANUAL_OP_SPECS = {} | ||
|
|
||
|
|
||
| def _generate_pybind11(operator): | ||
|
|
@@ -135,6 +156,8 @@ def _generate_params(node): | |
| ) | ||
| .replace("const Tensor", "py::object") | ||
| .replace("Tensor", "py::object") | ||
| .replace("std::optional<float>", "float") | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
| .replace("std::optional<int>", "bool") | ||
| ) | ||
|
|
||
| def _generate_arguments(node): | ||
|
|
@@ -173,7 +196,8 @@ def _generate_call(op_name, call, method=True): | |
| ) | ||
| calls = "\n".join(_generate_call(operator.name, call) for call in operator.calls) | ||
| callers = "\n".join( | ||
| _generate_call(operator.name, call, method=False) for call in operator.calls | ||
| _generate_call(operator.header_name, call, method=False) | ||
| for call in operator.calls | ||
| ) | ||
|
|
||
| return f"""#ifndef INFINI_OPS_BINDINGS_{op_name.upper()}_H_ | ||
|
|
@@ -182,7 +206,7 @@ def _generate_call(op_name, call, method=True): | |
| #include <pybind11/pybind11.h> | ||
| #include <pybind11/stl.h> | ||
|
|
||
| #include "base/{op_name.lower()}.h" | ||
| #include "base/{operator.header_name}.h" | ||
| #include "utils.h" | ||
|
|
||
| namespace py = pybind11; | ||
|
|
@@ -213,7 +237,7 @@ def _generate_source(operator): | |
|
|
||
| return f"""#include "../../handle.h" | ||
| #include "../../tensor.h" | ||
| #include "infiniop/ops/{operator.name.lower()}.h" | ||
| #include "infiniop/ops/{operator.header_name}.h" | ||
| {impl_includes} | ||
|
|
||
| static infini::ops::DataType DataTypeFromInfiniDType( | ||
|
|
@@ -270,7 +294,7 @@ def _generate_header(operator): | |
| return f"""#ifndef __INFINIOP_{operator.name.upper()}_API_H__ | ||
| #define __INFINIOP_{operator.name.upper()}_API_H__ | ||
|
|
||
| #include "base/{operator.name.lower()}.h" | ||
| #include "base/{operator.header_name}.h" | ||
|
|
||
| typedef struct infini::ops::Operator<infini::ops::{operator.name}> *infiniop{operator.name}Descriptor_t; | ||
|
|
||
|
|
@@ -382,20 +406,21 @@ def _generate_tensor_caster(name, is_data=False): | |
| def _get_all_ops(devices): | ||
| ops = {} | ||
|
|
||
| for file_path in _BASE_DIR.iterdir(): | ||
| if not file_path.is_file(): | ||
| for base_file in _BASE_DIR.iterdir(): | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 把 |
||
| if not base_file.is_file(): | ||
| continue | ||
|
|
||
| op_name = "".join(word.capitalize() for word in file_path.stem.split("_")) | ||
|
|
||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 同上。 |
||
| ops[op_name] = [] | ||
| op_name = "".join(word.capitalize() for word in base_file.stem.split("_")) | ||
| impl_paths = [] | ||
|
|
||
| for file_path in _SRC_DIR.rglob("*"): | ||
| if not file_path.is_file() or file_path.parent.parent.name not in devices: | ||
| for impl_path in _SRC_DIR.rglob("*"): | ||
| if not impl_path.is_file() or impl_path.parent.parent.name not in devices: | ||
| continue | ||
|
|
||
| if f"class Operator<{op_name}" in file_path.read_text(): | ||
| ops[op_name].append(file_path) | ||
| if f"class Operator<{op_name}" in impl_path.read_text(): | ||
| impl_paths.append(impl_path) | ||
|
|
||
| ops[op_name] = {"base_stem": base_file.stem, "impl_paths": impl_paths} | ||
|
|
||
| return ops | ||
|
|
||
|
|
@@ -429,12 +454,37 @@ def _get_all_ops(devices): | |
|
|
||
| (_BINDINGS_DIR / "utils.h").write_text(_UTILS_H_CONTENT) | ||
|
|
||
| for op_name, impl_paths in ops.items(): | ||
| extractor = _OperatorExtractor() | ||
| operator = extractor(op_name) | ||
| valid_ops = {} | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 同上。 |
||
| for op_name, op_data in ops.items(): | ||
| base_stem = op_data.get("base_stem") if isinstance(op_data, dict) else None | ||
| impl_paths = ( | ||
| op_data.get("impl_paths", op_data) | ||
| if isinstance(op_data, dict) | ||
| else op_data | ||
| ) | ||
|
|
||
| operator = None | ||
| if op_name in _MANUAL_OP_SPECS: | ||
| spec = _MANUAL_OP_SPECS[op_name] | ||
| operator = _Operator( | ||
| op_name, | ||
| constructors=[_make_mock_node(spec["constructor"])], | ||
| calls=[_make_mock_node(spec["call"])], | ||
| header_name=spec.get("header"), | ||
| ) | ||
| else: | ||
| extractor = _OperatorExtractor() | ||
| try: | ||
| operator = extractor(op_name, base_stem=base_stem) | ||
| except clang.cindex.TranslationUnitLoadError as e: | ||
| print( | ||
| f"Warning: Skipping {op_name} - failed to parse base header: {e}" | ||
| ) | ||
| continue | ||
|
|
||
| valid_ops[op_name] = impl_paths | ||
| source_path = _GENERATED_SRC_DIR / op_name.lower() | ||
| header_name = f"{op_name.lower()}.h" | ||
| header_name = f"{operator.header_name}.h" | ||
| bind_func_name = f"Bind{op_name}" | ||
|
|
||
| (_BINDINGS_DIR / header_name).write_text(_generate_pybind11(operator)) | ||
|
|
@@ -451,15 +501,20 @@ def _get_all_ops(devices): | |
|
|
||
| impl_includes = "\n".join( | ||
| f'#include "{impl_path}"' | ||
| for impl_paths in ops.values() | ||
| for impl_paths in valid_ops.values() | ||
| for impl_path in impl_paths | ||
| ) | ||
| op_includes = "\n".join(f'#include "{header_path}"' for header_path in header_paths) | ||
| bind_func_calls = "\n".join( | ||
| f"{bind_func_name}(m);" for bind_func_name in bind_func_names | ||
| ) | ||
|
|
||
| (_BINDINGS_DIR / "ops.cc").write_text(f"""#include <pybind11/pybind11.h> | ||
| has_cuda_impl = any( | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 目前没有 |
||
| str(p).endswith(".cu") for impls in valid_ops.values() for p in impls | ||
| ) | ||
| ops_source = "ops.cu" if has_cuda_impl else "ops.cc" | ||
|
|
||
| (_BINDINGS_DIR / ops_source).write_text(f"""#include <pybind11/pybind11.h> | ||
|
|
||
| // clang-format off | ||
| {impl_includes} | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -43,8 +43,11 @@ if(WITH_NVIDIA) | |
| target_link_libraries(infiniops PUBLIC CUDA::cudart CUDA::cublas CUDA::cuda_driver) | ||
|
|
||
| list(APPEND DEVICE_LIST "nvidia") | ||
| set_target_properties(infiniops PROPERTIES CUDA_STANDARD 17 | ||
| CUDA_STANDARD_REQUIRED ON) | ||
| endif() | ||
|
|
||
| # Iluvatar: CUDA-compatible device; -x ivcore and flags from top-level CMakeLists.txt | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 这行注释可以去掉。 |
||
| if(WITH_ILUVATAR) | ||
| set(ILUVATAR_PATTERNS | ||
| "cuda/*.cc" | ||
|
|
@@ -65,6 +68,9 @@ if(WITH_ILUVATAR) | |
| find_package(CUDAToolkit REQUIRED) | ||
| target_link_libraries(infiniops PUBLIC CUDA::cudart CUDA::cublas CUDA::cuda_driver) | ||
|
|
||
| set_target_properties(infiniops PROPERTIES CUDA_STANDARD 17 | ||
| CUDA_STANDARD_REQUIRED ON) | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
|
|
||
| list(APPEND DEVICE_LIST "iluvatar") | ||
| endif() | ||
|
|
||
|
|
@@ -112,7 +118,7 @@ if(GENERATE_PYTHON_BINDINGS) | |
| set(PYBIND11_SOURCES "${PROJECT_SOURCE_DIR}/generated/bindings/ops.cc") | ||
|
|
||
| # TODO: There might be a better solution. | ||
| if(WITH_NVIDIA) | ||
| if(WITH_NVIDIA OR WITH_ILUVATAR) | ||
| set_source_files_properties(${PYBIND11_SOURCES} PROPERTIES LANGUAGE CUDA) | ||
| endif() | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -36,6 +36,12 @@ class Gemm : public Operator<Gemm> { | |
| // TODO: Check constraints. | ||
| } | ||
|
|
||
| Gemm(const Tensor a, const Tensor b, float alpha, float beta, bool trans_a, | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 这个是必须引入的嘛,可以测试一下,因为我记得传 |
||
| bool trans_b, Tensor c) | ||
| : Gemm{a, b, std::optional<float>(alpha), std::optional<float>(beta), | ||
| std::optional<int>(static_cast<int>(trans_a)), | ||
| std::optional<int>(static_cast<int>(trans_b)), c} {} | ||
|
|
||
| Gemm(const Tensor a, const Tensor b, Tensor c) | ||
| : Gemm{a, b, std::nullopt, std::nullopt, std::nullopt, std::nullopt, c} {} | ||
|
|
||
|
|
@@ -44,6 +50,15 @@ class Gemm : public Operator<Gemm> { | |
| std::optional<int> trans_a, | ||
| std::optional<int> trans_b, Tensor c) const = 0; | ||
|
|
||
| virtual void operator()(void* stream, const Tensor a, const Tensor b, | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 同上。 |
||
| float alpha, float beta, bool trans_a, bool trans_b, | ||
| Tensor c) const { | ||
| return operator()(stream, a, b, std::optional<float>(alpha), | ||
| std::optional<float>(beta), | ||
| std::optional<int>(static_cast<int>(trans_a)), | ||
| std::optional<int>(static_cast<int>(trans_b)), c); | ||
| } | ||
|
|
||
| virtual void operator()(void* stream, const Tensor a, const Tensor b, | ||
| Tensor c) const { | ||
| return operator()(stream, a, b, std::nullopt, std::nullopt, std::nullopt, | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,58 @@ | ||
| #ifndef INFINI_OPS_BASE_RMS_NORM_H_ | ||
| #define INFINI_OPS_BASE_RMS_NORM_H_ | ||
|
|
||
| #include <cstddef> | ||
| #include <vector> | ||
|
|
||
| #include "operator.h" | ||
| #include "tensor.h" | ||
|
|
||
| namespace infini::ops { | ||
|
|
||
| class RmsNorm : public Operator<RmsNorm> { | ||
| public: | ||
| RmsNorm(const Tensor y, const Tensor x, const Tensor w, float epsilon) | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 命名应当遵循贡献指南，因为 PyTorch 中有 `RMSNorm`（`torch.nn.RMSNorm`），命名应与其保持一致。
|
||
| : epsilon_{epsilon}, | ||
| y_shape_{y.shape()}, | ||
| x_shape_{x.shape()}, | ||
| y_strides_{y.strides()}, | ||
| x_strides_{x.strides()}, | ||
| dim_{y.size(-1)}, | ||
| ndim_{y.ndim()}, | ||
| batch_size_{ndim_ == 2 ? y.size(-2) : y.size(-3)}, | ||
| nhead_{ndim_ == 2 ? 1 : y.size(-2)} {} | ||
|
|
||
| RmsNorm(const Tensor y, const Tensor x, const Tensor w) | ||
| : RmsNorm{y, x, w, 1e-6f} {} | ||
|
|
||
| virtual void operator()(void* stream, Tensor y, const Tensor x, | ||
| const Tensor w, float epsilon) const = 0; | ||
|
|
||
| virtual void operator()(void* stream, Tensor y, const Tensor x, | ||
| const Tensor w) const { | ||
| return operator()(stream, y, x, w, epsilon_); | ||
| } | ||
|
|
||
| protected: | ||
| float epsilon_{1e-6f}; | ||
|
|
||
| Tensor::Shape y_shape_; | ||
|
|
||
| Tensor::Shape x_shape_; | ||
|
|
||
| Tensor::Strides y_strides_; | ||
|
|
||
| Tensor::Strides x_strides_; | ||
|
|
||
| Tensor::Size dim_{0}; | ||
|
|
||
| Tensor::Size ndim_{0}; | ||
|
|
||
| Tensor::Size batch_size_{0}; | ||
|
|
||
| Tensor::Size nhead_{1}; | ||
| }; | ||
|
|
||
| } // namespace infini::ops | ||
|
|
||
| #endif | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -3,6 +3,8 @@ | |
|
|
||
| #ifdef WITH_NVIDIA | ||
| #include <cuda_runtime.h> | ||
| #elif defined(WITH_ILUVATAR) | ||
| #include <cuda_runtime.h> | ||
| #elif WITH_METAX | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 如果有空的话,可以帮忙在这一行加一行 comment: |
||
| #include <mcr/mc_runtime.h> | ||
| #endif | ||
|
|
||


There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
使用 Markdown 语法:
"`WITH_NVIDIA` and `WITH_ILUVATAR` cannot both be `ON`. Build one GPU backend at a time."。