diff --git a/CMakeLists.txt b/CMakeLists.txt
index f41255273..49ddcda61 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -5,6 +5,7 @@ message(STATUS "Project " ${PROJECT_NAME} " version " ${PROJECT_VERSION})
 option(ABSL_PROPAGATE_CXX_STD "Abseil need this option" ON)
 option(USE_CUDA "Support Nvidia GPU" OFF)
 option(USE_KUNLUN "Support Baidu Kunlunxin" OFF)
+option(USE_BANG "Support Cambricon MLU" OFF)
 
 set(CMAKE_CXX_STANDARD 20)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
@@ -41,6 +42,45 @@ if(USE_KUNLUN)
     message(STATUS "KUNLUN_HOME: ${KUNLUN_HOME}")
 endif()
 
+if (USE_BANG)
+    add_compile_definitions(USE_BANG)
+    include_directories(src/kernels/mlu/include)
+
+    # Neuware Environment: an explicit -DNEUWARE_HOME takes precedence over $ENV{NEUWARE_HOME}
+    if ((NOT DEFINED NEUWARE_HOME) AND (NOT DEFINED ENV{NEUWARE_HOME}))
+        message(FATAL_ERROR "NEUWARE_HOME is not defined from cmake or env")
+    elseif (DEFINED NEUWARE_HOME)
+        set(NEUWARE_HOME "${NEUWARE_HOME}" CACHE PATH "NEUWARE_HOME directory for Cambricon Neuware development")
+    else()
+        set(NEUWARE_HOME "$ENV{NEUWARE_HOME}" CACHE PATH "NEUWARE_HOME directory for Cambricon Neuware development")
+    endif()
+    message(STATUS "NEUWARE_HOME: ${NEUWARE_HOME}")
+
+    # cnrt cndrv cnnl
+    include_directories("${NEUWARE_HOME}/include")
+    find_library(CAMBRICON_CNNL libcnnl.so "${NEUWARE_HOME}/lib64")
+    find_library(CAMBRICON_CNRT libcnrt.so "${NEUWARE_HOME}/lib64")
+    find_library(CAMBRICON_CNDRV libcndrv.so "${NEUWARE_HOME}/lib64")
+    # Stop at configure time with a readable message instead of handing *-NOTFOUND to the linker
+    foreach (_cambricon_lib CAMBRICON_CNNL CAMBRICON_CNRT CAMBRICON_CNDRV)
+        if (NOT ${_cambricon_lib})
+            message(FATAL_ERROR "${_cambricon_lib} was not found under ${NEUWARE_HOME}/lib64")
+        endif()
+    endforeach()
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -lstdc++ -Wall")
+
+    # TARGET_CPU_ARCH: -D value first, then the environment, else `uname -m` of the build host
+    if ((NOT DEFINED TARGET_CPU_ARCH) AND (NOT DEFINED ENV{TARGET_CPU_ARCH}))
+        execute_process(COMMAND uname -m OUTPUT_VARIABLE _uname_m OUTPUT_STRIP_TRAILING_WHITESPACE)
+        set(TARGET_CPU_ARCH "${_uname_m}" CACHE STRING "Target CPU ARCH")
+    elseif(DEFINED TARGET_CPU_ARCH)
+        set(TARGET_CPU_ARCH "${TARGET_CPU_ARCH}" CACHE STRING "Target CPU ARCH")
+    else()
+        set(TARGET_CPU_ARCH "$ENV{TARGET_CPU_ARCH}" CACHE STRING "Target CPU ARCH")
+    endif()
+    message(STATUS "TARGET_CPU_ARCH: ${TARGET_CPU_ARCH}")
+endif()
+
add_compile_options(-march=native) # this will cause error in some machine add_compile_options(-mtune=native) add_compile_options(-Wall) diff --git a/Makefile b/Makefile index cf3e402ce..482087658 100644 --- a/Makefile +++ b/Makefile @@ -3,13 +3,14 @@ TYPE ?= Debug CUDA ?= OFF KUNLUN ?= OFF +BANG ?= OFF CMAKE_EXTRA = # CMAKE_EXTRA += -DCMAKE_CXX_COMPILER= build: mkdir -p build - cmake -Bbuild -DCMAKE_BUILD_TYPE=$(TYPE) -DUSE_CUDA=$(CUDA) -DUSE_KUNLUN=$(KUNLUN) $(CMAKE_EXTRA) + cmake -Bbuild -DCMAKE_BUILD_TYPE=$(TYPE) -DUSE_CUDA=$(CUDA) -DUSE_KUNLUN=$(KUNLUN) -DUSE_BANG=$(BANG) $(CMAKE_EXTRA) make -j -C build install-python: build diff --git a/src/02hardware/include/hardware/device.h b/src/02hardware/include/hardware/device.h index 335c759bf..91e5c4509 100644 --- a/src/02hardware/include/hardware/device.h +++ b/src/02hardware/include/hardware/device.h @@ -11,6 +11,8 @@ namespace refactor::hardware { enum class Type : int32_t { Cpu, Nvidia, + Mlu, + Kunlun, }; protected: diff --git a/src/02hardware/include/hardware/devices/mlu.h b/src/02hardware/include/hardware/devices/mlu.h new file mode 100644 index 000000000..aa3b76529 --- /dev/null +++ b/src/02hardware/include/hardware/devices/mlu.h @@ -0,0 +1,19 @@ +#ifndef HARDWARE_DEVICES_MLU_H +#define HARDWARE_DEVICES_MLU_H + +#include "../device.h" + +namespace refactor::hardware { + + class Mlu final : public Device { + public: + explicit Mlu(int32_t card); + void setContext() const noexcept final; + Type type() const noexcept final { + return Type::Mlu; + } + }; + +}// namespace refactor::hardware + +#endif// HARDWARE_DEVICES_MLU_H diff --git a/src/02hardware/src/device_manager.cpp b/src/02hardware/src/device_manager.cpp index bcfab3bb8..15ae3b901 100644 --- a/src/02hardware/src/device_manager.cpp +++ b/src/02hardware/src/device_manager.cpp @@ -1,5 +1,6 @@ #include "hardware/device_manager.h" #include "hardware/devices/cpu.h" +#include "hardware/devices/mlu.h" #include "hardware/devices/nvidia.h" namespace 
refactor::hardware::device { @@ -37,6 +38,7 @@ namespace refactor::hardware::device { using T = Device::Type; // clang-format off auto device = type == T::Nvidia ? std::make_shared(card) + : type == T::Mlu ? std::make_shared(card) : UNREACHABLEX(Arc, ""); // clang-format on auto [kind, ok] = DEVICES.try_emplace(static_cast(type)); diff --git a/src/02hardware/src/devices/cpu/memory.cc b/src/02hardware/src/devices/cpu/memory.cc index 4db56e98e..dbd64f51e 100644 --- a/src/02hardware/src/devices/cpu/memory.cc +++ b/src/02hardware/src/devices/cpu/memory.cc @@ -5,19 +5,19 @@ namespace refactor::hardware { using M = CpuMemory; - void *M::malloc(size_t size) noexcept { + void *M::malloc(size_t size) { return std::malloc(size); } - void M::free(void *ptr) noexcept { + void M::free(void *ptr) { std::free(ptr); } - void *M::copyHD(void *dst, void const *src, size_t bytes) const noexcept { + void *M::copyHD(void *dst, void const *src, size_t bytes) const { return std::memcpy(dst, src, bytes); } - void *M::copyDH(void *dst, void const *src, size_t bytes) const noexcept { + void *M::copyDH(void *dst, void const *src, size_t bytes) const { return std::memcpy(dst, src, bytes); } - void *M::copyDD(void *dst, void const *src, size_t bytes) const noexcept { + void *M::copyDD(void *dst, void const *src, size_t bytes) const { return std::memcpy(dst, src, bytes); } diff --git a/src/02hardware/src/devices/cpu/memory.hh b/src/02hardware/src/devices/cpu/memory.hh index 5bd3a1dda..d1681b24d 100644 --- a/src/02hardware/src/devices/cpu/memory.hh +++ b/src/02hardware/src/devices/cpu/memory.hh @@ -6,11 +6,11 @@ namespace refactor::hardware { class CpuMemory final : public Memory { - void *malloc(size_t) noexcept final; - void free(void *) noexcept final; - void *copyHD(void *dst, void const *src, size_t bytes) const noexcept final; - void *copyDH(void *dst, void const *src, size_t bytes) const noexcept final; - void *copyDD(void *dst, void const *src, size_t bytes) const noexcept final; + void 
*malloc(size_t) final; + void free(void *) final; + void *copyHD(void *dst, void const *src, size_t bytes) const final; + void *copyDH(void *dst, void const *src, size_t bytes) const final; + void *copyDD(void *dst, void const *src, size_t bytes) const final; }; }// namespace refactor::hardware diff --git a/src/02hardware/src/devices/mlu/device.cc b/src/02hardware/src/devices/mlu/device.cc new file mode 100644 index 000000000..ea1f6affd --- /dev/null +++ b/src/02hardware/src/devices/mlu/device.cc @@ -0,0 +1,31 @@ +#include "functions.hh" +#include "hardware/devices/mlu.h" +#include "hardware/mem_pool.h" +#include "memory.hh" + +namespace refactor::hardware { + + static Arc bangMemory(int32_t card) { +#ifdef USE_BANG + ASSERT(0 <= card && card < getDeviceCount(), "Invalid card id: {}", card); + setDevice(card); + auto [free, total] = getMemInfo(); + auto size = std::min(free, std::max(5ul << 30, total * 4 / 5)); + fmt::println("initializing Cambricon MLU {}, memory {} / {}, alloc {}", + card, free, total, size); + return std::make_shared( + std::make_shared(), + size, + 256ul); +#else + return nullptr; +#endif + } + + Mlu::Mlu(int32_t card) : Device(card, bangMemory(card)) {} + + void Mlu::setContext() const noexcept { + setDevice(_card); + } + +}// namespace refactor::hardware diff --git a/src/02hardware/src/devices/mlu/functions.cc b/src/02hardware/src/devices/mlu/functions.cc new file mode 100644 index 000000000..bedea0458 --- /dev/null +++ b/src/02hardware/src/devices/mlu/functions.cc @@ -0,0 +1,21 @@ +#include "functions.hh" + +namespace refactor::hardware { + +#ifdef USE_BANG + int getDeviceCount() { + unsigned deviceCount; + BANG_ASSERT(cnrtGetDeviceCount(&deviceCount)); + return static_cast(deviceCount); + } + void setDevice(int device) { + BANG_ASSERT(cnrtSetDevice(device)); + } + MemInfo getMemInfo() { + MemInfo memInfo; + BANG_ASSERT(cnrtMemGetInfo(&memInfo.free, &memInfo.total)); + return memInfo; + } +#endif + +}// namespace refactor::hardware diff 
--git a/src/02hardware/src/devices/mlu/functions.hh b/src/02hardware/src/devices/mlu/functions.hh new file mode 100644 index 000000000..f12faab4b --- /dev/null +++ b/src/02hardware/src/devices/mlu/functions.hh @@ -0,0 +1,28 @@ +#ifndef HARDWARE_DEVICES_MLU_FUNCTIONS_CUH +#define HARDWARE_DEVICES_MLU_FUNCTIONS_CUH + +#include "common.h" + +#ifdef USE_BANG +#include "cnrt.h" + +#define BANG_ASSERT(STATUS) \ + if (auto status = (STATUS); status != CNRT_RET_SUCCESS) { \ + RUNTIME_ERROR(fmt::format("bang failed on \"" #STATUS "\" with \"{}\" ({})", \ + cnrtGetErrorStr(status), (int) status)); \ + } +#endif + +namespace refactor::hardware { + + struct MemInfo { + size_t free, total; + }; + + int getDeviceCount(); + void setDevice(int device); + MemInfo getMemInfo(); + +}// namespace refactor::hardware + +#endif// HARDWARE_DEVICES_NVIDIA_FUNCTIONS_CUH diff --git a/src/02hardware/src/devices/mlu/memory.cc b/src/02hardware/src/devices/mlu/memory.cc new file mode 100644 index 000000000..55550314a --- /dev/null +++ b/src/02hardware/src/devices/mlu/memory.cc @@ -0,0 +1,33 @@ +#include "memory.hh" +#include "functions.hh" + +namespace refactor::hardware { +#ifdef USE_BANG + using M = MluMemory; + + void *M::malloc(size_t size) { + void *ptr; + BANG_ASSERT(cnrtMalloc(&ptr, size)); + return ptr; + } + void M::free(void *ptr) { + BANG_ASSERT(cnrtFree(ptr)); + } + void *M::copyHD(void *dst, void const *src, size_t bytes) const { + BANG_ASSERT(cnrtMemcpy(dst, const_cast(src), bytes, + CNRT_MEM_TRANS_DIR_HOST2DEV)) + return dst; + } + void *M::copyDH(void *dst, void const *src, size_t bytes) const { + BANG_ASSERT(cnrtMemcpy(dst, const_cast(src), bytes, + CNRT_MEM_TRANS_DIR_DEV2HOST)); + return dst; + } + void *M::copyDD(void *dst, void const *src, size_t bytes) const { + BANG_ASSERT(cnrtMemcpy(dst, const_cast(src), bytes, + CNRT_MEM_TRANS_DIR_PEER2PEER)); + return dst; + } +#endif + +}// namespace refactor::hardware diff --git a/src/02hardware/src/devices/mlu/memory.hh 
b/src/02hardware/src/devices/mlu/memory.hh new file mode 100644 index 000000000..85ec39887 --- /dev/null +++ b/src/02hardware/src/devices/mlu/memory.hh @@ -0,0 +1,18 @@ +#ifndef HARDWARE_DEVICES_MLU_MEMORY_CUH +#define HARDWARE_DEVICES_MLU_MEMORY_CUH + +#include "hardware/memory.h" + +namespace refactor::hardware { + + class MluMemory final : public Memory { + void *malloc(size_t) final; + void free(void *) final; + void *copyHD(void *dst, void const *src, size_t bytes) const final; + void *copyDH(void *dst, void const *src, size_t bytes) const final; + void *copyDD(void *dst, void const *src, size_t bytes) const final; + }; + +}// namespace refactor::hardware + +#endif// HARDWARE_DEVICES_MLU_MEMORY_HH diff --git a/src/02hardware/src/devices/nvidia/device.cc b/src/02hardware/src/devices/nvidia/device.cc index a5dc71631..7b25cc1f8 100644 --- a/src/02hardware/src/devices/nvidia/device.cc +++ b/src/02hardware/src/devices/nvidia/device.cc @@ -34,7 +34,7 @@ namespace refactor::hardware { size, alignment); #else - RUNTIME_ERROR("CUDA is not enabled"); + return nullptr; #endif } diff --git a/src/04kernel/CMakeLists.txt b/src/04kernel/CMakeLists.txt index 75e7e50fe..3a401ac35 100644 --- a/src/04kernel/CMakeLists.txt +++ b/src/04kernel/CMakeLists.txt @@ -35,6 +35,9 @@ if(USE_KUNLUN) find_library(KUNLUN_DNN libxpuapi.so ${KUNLUN_HOME}/XTDK/shlib) target_link_libraries(kernel PUBLIC ${KUNLUN_RT} ${KUNLUN_DNN}) endif() +if(USE_BANG) + target_link_libraries(kernel PUBLIC ${CAMBRICON_CNNL} ${CAMBRICON_CNRT} ${CAMBRICON_CNDRV}) +endif() file(GLOB_RECURSE KERNEL_TEST test/*.cpp) if(KERNEL_TEST)