diff --git a/CMakeLists.txt b/CMakeLists.txt
index f41255273..49ddcda61 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -5,6 +5,7 @@ message(STATUS "Project " ${PROJECT_NAME} " version " ${PROJECT_VERSION})
 option(ABSL_PROPAGATE_CXX_STD "Abseil need this option" ON)
 option(USE_CUDA "Support Nvidia GPU" OFF)
 option(USE_KUNLUN "Support Baidu Kunlunxin" OFF)
+option(USE_BANG "Support Cambricon MLU" OFF)
 
 set(CMAKE_CXX_STANDARD 20)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
@@ -41,6 +42,51 @@ if(USE_KUNLUN)
     message(STATUS "KUNLUN_HOME: ${KUNLUN_HOME}")
 endif()
 
+# Cambricon MLU (BANG) backend: defines USE_BANG for every target, locates the
+# Neuware SDK (headers plus cnnl/cnrt/cndrv) and records the target CPU arch.
+if (USE_BANG)
+    add_compile_definitions(USE_BANG)
+    include_directories(src/kernels/mlu/include)
+
+    # Neuware environment: a NEUWARE_HOME CMake variable (e.g. -DNEUWARE_HOME=...)
+    # takes precedence over the NEUWARE_HOME environment variable.
+    if ((NOT DEFINED NEUWARE_HOME) AND (NOT DEFINED ENV{NEUWARE_HOME}))
+        message(FATAL_ERROR "NEUWARE_HOME is not defined from cmake or env")
+    elseif (DEFINED NEUWARE_HOME)
+        set(NEUWARE_HOME "${NEUWARE_HOME}" CACHE STRING "NEUWARE_HOME directory for Cambricon Neuware development")
+    else()
+        set(NEUWARE_HOME "$ENV{NEUWARE_HOME}" CACHE STRING "NEUWARE_HOME directory for Cambricon Neuware development")
+    endif()
+    message(STATUS "NEUWARE_HOME: ${NEUWARE_HOME}")
+
+    # cnrt cndrv cnnl
+    include_directories("${NEUWARE_HOME}/include")
+    find_library(CAMBRICON_CNNL libcnnl.so "${NEUWARE_HOME}/lib64")
+    find_library(CAMBRICON_CNRT libcnrt.so "${NEUWARE_HOME}/lib64")
+    find_library(CAMBRICON_CNDRV libcndrv.so "${NEUWARE_HOME}/lib64")
+    # Stop at configure time when a library is missing; otherwise the
+    # <VAR>-NOTFOUND value only surfaces later, where the libraries are linked.
+    foreach (_bang_lib CAMBRICON_CNNL CAMBRICON_CNRT CAMBRICON_CNDRV)
+        if (NOT ${_bang_lib})
+            message(FATAL_ERROR "${_bang_lib} not found (searched ${NEUWARE_HOME}/lib64)")
+        endif()
+    endforeach()
+    # NOTE(review): -lstdc++ is a linker input and -Wall is added again below by
+    # add_compile_options(-Wall); both are kept here unchanged.
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -lstdc++ -Wall")
+
+    # Target CPU architecture: CMake variable first, then environment, then `uname -m`.
+    if ((NOT DEFINED TARGET_CPU_ARCH) AND (NOT DEFINED ENV{TARGET_CPU_ARCH}))
+        execute_process(COMMAND uname -m OUTPUT_VARIABLE _uname_m OUTPUT_STRIP_TRAILING_WHITESPACE)
+        set(TARGET_CPU_ARCH "${_uname_m}" CACHE STRING "Target CPU ARCH")
+    elseif(DEFINED TARGET_CPU_ARCH)
+        set(TARGET_CPU_ARCH "${TARGET_CPU_ARCH}" CACHE STRING "Target CPU ARCH")
+    else()
+        set(TARGET_CPU_ARCH "$ENV{TARGET_CPU_ARCH}" CACHE STRING "Target CPU ARCH")
+    endif()
+    message(STATUS "TARGET_CPU_ARCH: ${TARGET_CPU_ARCH}")
+endif()
+
 add_compile_options(-march=native) # this will cause error in some machine
 add_compile_options(-mtune=native)
 add_compile_options(-Wall)
diff --git a/Makefile b/Makefile
index cf3e402ce..482087658 100644
--- a/Makefile
+++ b/Makefile
@@ -3,13 +3,14 @@
 TYPE ?= Debug
 CUDA ?= OFF
 KUNLUN ?= OFF
+BANG ?= OFF
 
 CMAKE_EXTRA =
 # CMAKE_EXTRA += -DCMAKE_CXX_COMPILER=
 
 build:
 	mkdir -p build
-	cmake -Bbuild -DCMAKE_BUILD_TYPE=$(TYPE) -DUSE_CUDA=$(CUDA) -DUSE_KUNLUN=$(KUNLUN) $(CMAKE_EXTRA)
+	cmake -Bbuild -DCMAKE_BUILD_TYPE=$(TYPE) -DUSE_CUDA=$(CUDA) -DUSE_KUNLUN=$(KUNLUN) -DUSE_BANG=$(BANG) $(CMAKE_EXTRA)
 	make -j -C build
 
 install-python: build
diff --git a/src/02hardware/include/hardware/device.h b/src/02hardware/include/hardware/device.h
index 335c759bf..91e5c4509 100644
--- a/src/02hardware/include/hardware/device.h
+++ b/src/02hardware/include/hardware/device.h
@@ -11,6 +11,8 @@ namespace refactor::hardware {
         enum class Type : int32_t {
             Cpu,
             Nvidia,
+            Mlu,
+            Kunlun,
         };
 
     protected:
diff --git a/src/02hardware/include/hardware/devices/mlu.h b/src/02hardware/include/hardware/devices/mlu.h
new file mode 100644
index 000000000..aa3b76529
--- /dev/null
+++ b/src/02hardware/include/hardware/devices/mlu.h
@@ -0,0 +1,23 @@
+#ifndef HARDWARE_DEVICES_MLU_H
+#define HARDWARE_DEVICES_MLU_H
+
+#include "../device.h"
+
+namespace refactor::hardware {
+
+    /// `Device` implementation for a Cambricon MLU card.
+    class Mlu final : public Device {
+    public:
+        /// Creates the device object for the card with index `card`.
+        explicit Mlu(int32_t card);
+
+        /// Makes this card the current device of the MLU runtime.
+        void setContext() const noexcept final;
+
+        /// Identifies this device as `Type::Mlu`.
+        Type type() const noexcept final { return Type::Mlu; }
+    };
+
+}// namespace refactor::hardware
+
+#endif// HARDWARE_DEVICES_MLU_H
diff --git a/src/02hardware/src/device_manager.cpp b/src/02hardware/src/device_manager.cpp
index bcfab3bb8..15ae3b901 100644
--- a/src/02hardware/src/device_manager.cpp
+++ b/src/02hardware/src/device_manager.cpp
@@ -1,5 +1,6 @@
 ﻿#include "hardware/device_manager.h"
 #include "hardware/devices/cpu.h"
+#include "hardware/devices/mlu.h"
 #include "hardware/devices/nvidia.h"
 
 namespace refactor::hardware::device {
@@ -37,6 +38,7 @@ namespace refactor::hardware::device {
         using T = Device::Type;
         // clang-format off
         auto device = type == T::Nvidia ? std::make_shared<Nvidia>(card)
+                    : type == T::Mlu    ? std::make_shared<Mlu>(card)
                     : UNREACHABLEX(Arc<Device>, "");
         // clang-format on
         auto [kind, ok] = DEVICES.try_emplace(static_cast<int32_t>(type));
diff --git a/src/02hardware/src/devices/cpu/memory.cc b/src/02hardware/src/devices/cpu/memory.cc
index 4db56e98e..dbd64f51e 100644
--- a/src/02hardware/src/devices/cpu/memory.cc
+++ b/src/02hardware/src/devices/cpu/memory.cc
@@ -5,19 +5,19 @@
 namespace refactor::hardware {
     using M = CpuMemory;
 
-    void *M::malloc(size_t size) noexcept {
+    void *M::malloc(size_t size) {
         return std::malloc(size);
     }
-    void M::free(void *ptr) noexcept {
+    void M::free(void *ptr) {
         std::free(ptr);
     }
-    void *M::copyHD(void *dst, void const *src, size_t bytes) const noexcept {
+    void *M::copyHD(void *dst, void const *src, size_t bytes) const {
         return std::memcpy(dst, src, bytes);
     }
-    void *M::copyDH(void *dst, void const *src, size_t bytes) const noexcept {
+    void *M::copyDH(void *dst, void const *src, size_t bytes) const {
         return std::memcpy(dst, src, bytes);
     }
-    void *M::copyDD(void *dst, void const *src, size_t bytes) const noexcept {
+    void *M::copyDD(void *dst, void const *src, size_t bytes) const {
         return std::memcpy(dst, src, bytes);
     }
 
diff --git a/src/02hardware/src/devices/cpu/memory.hh b/src/02hardware/src/devices/cpu/memory.hh
index 5bd3a1dda..d1681b24d 100644
--- a/src/02hardware/src/devices/cpu/memory.hh
+++ b/src/02hardware/src/devices/cpu/memory.hh
@@ -6,11 +6,11 @@
 namespace refactor::hardware {
 
     class CpuMemory final : public Memory {
-        void *malloc(size_t) noexcept final;
-        void free(void *) noexcept final;
-        void *copyHD(void *dst, void const *src, size_t bytes) const noexcept final;
-        void *copyDH(void *dst, void const *src, size_t bytes) const noexcept final;
-        void *copyDD(void *dst, void const *src, size_t bytes) const noexcept final;
+        void *malloc(size_t) final;
+        void free(void *) final;
+        void *copyHD(void *dst, void const *src, size_t bytes) const final;
+        void *copyDH(void *dst, void const *src, size_t bytes) const final;
+        void *copyDD(void *dst, void const *src, size_t bytes) const final;
     };
 
 }// namespace refactor::hardware
diff --git a/src/02hardware/src/devices/mlu/device.cc b/src/02hardware/src/devices/mlu/device.cc
new file mode 100644
index 000000000..ea1f6affd
--- /dev/null
+++ b/src/02hardware/src/devices/mlu/device.cc
@@ -0,0 +1,43 @@
+#include "functions.hh"
+#include "hardware/devices/mlu.h"
+#include "hardware/mem_pool.h"
+#include "memory.hh"
+
+namespace refactor::hardware {
+
+    /// Builds the memory manager handed to the `Device` base for MLU `card`.
+    ///
+    /// With `USE_BANG`: asserts that `card` is a valid device index, selects it and
+    /// returns a `MemPool` over `MluMemory` sized `min(free, max(5 << 30, 4/5 of total))`.
+    /// The trailing `256ul` is presumably the pool alignment — confirm against `MemPool`.
+    /// Without `USE_BANG`: returns `nullptr`.
+    static Arc<Memory> bangMemory(int32_t card) {
+#ifdef USE_BANG
+        ASSERT(0 <= card && card < getDeviceCount(), "Invalid card id: {}", card);
+        setDevice(card);
+        auto [free, total] = getMemInfo();
+        auto size = std::min(free, std::max(5ul << 30, total * 4 / 5));
+        fmt::println("initializing Cambricon MLU {}, memory {} / {}, alloc {}",
+                     card, free, total, size);
+        return std::make_shared<MemPool>(
+            std::make_shared<MluMemory>(),
+            size,
+            256ul);
+#else
+        return nullptr;
+#endif
+    }
+
+    /// Constructs the device, forwarding `card` and its memory manager to `Device`.
+    Mlu::Mlu(int32_t card) : Device(card, bangMemory(card)) {}
+
+    /// Makes `_card` the current MLU device.
+    /// The call is compiled only with `USE_BANG`, because `setDevice` has no
+    /// definition otherwise; without it this function does nothing.
+    void Mlu::setContext() const noexcept {
+#ifdef USE_BANG
+        setDevice(_card);
+#endif
+    }
+
+}// namespace refactor::hardware
diff --git a/src/02hardware/src/devices/mlu/functions.cc b/src/02hardware/src/devices/mlu/functions.cc
new file mode 100644
index 000000000..bedea0458
--- /dev/null
+++ b/src/02hardware/src/devices/mlu/functions.cc
@@ -0,0 +1,26 @@
+#include "functions.hh"
+
+namespace refactor::hardware {
+
+#ifdef USE_BANG
+    /// Returns the number of MLU devices reported by `cnrtGetDeviceCount`.
+    int getDeviceCount() {
+        unsigned count;
+        BANG_ASSERT(cnrtGetDeviceCount(&count));
+        return static_cast<int>(count);
+    }
+
+    /// Selects `device` via `cnrtSetDevice`.
+    void setDevice(int device) {
+        BANG_ASSERT(cnrtSetDevice(device));
+    }
+
+    /// Returns the free/total memory figures filled in by `cnrtMemGetInfo`.
+    MemInfo getMemInfo() {
+        MemInfo info;
+        BANG_ASSERT(cnrtMemGetInfo(&info.free, &info.total));
+        return info;
+    }
+#endif
+
+}// namespace refactor::hardware
diff --git a/src/02hardware/src/devices/mlu/functions.hh b/src/02hardware/src/devices/mlu/functions.hh
new file mode 100644
index 000000000..f12faab4b
--- /dev/null
+++ b/src/02hardware/src/devices/mlu/functions.hh
@@ -0,0 +1,35 @@
+#ifndef HARDWARE_DEVICES_MLU_FUNCTIONS_CUH
+#define HARDWARE_DEVICES_MLU_FUNCTIONS_CUH
+
+#include "common.h"
+
+#ifdef USE_BANG
+#include "cnrt.h"
+
+// Evaluates STATUS once and raises RUNTIME_ERROR, quoting the failed expression,
+// the `cnrtGetErrorStr` text and the numeric code, unless the result is
+// CNRT_RET_SUCCESS. Expands to a bare `if` statement, so do not follow a use
+// with `else`.
+#define BANG_ASSERT(STATUS)                                                          \
+    if (auto status = (STATUS); status != CNRT_RET_SUCCESS) {                        \
+        RUNTIME_ERROR(fmt::format("bang failed on \"" #STATUS "\" with \"{}\" ({})", \
+                                  cnrtGetErrorStr(status), (int) status));           \
+    }
+#endif
+
+namespace refactor::hardware {
+
+    /// Free and total memory sizes as returned by `getMemInfo`.
+    struct MemInfo {
+        size_t free, total;
+    };
+
+    // The functions below are declared unconditionally; their definitions are
+    // compiled only when USE_BANG is defined.
+    int getDeviceCount();
+    void setDevice(int device);
+    MemInfo getMemInfo();
+
+}// namespace refactor::hardware
+
+#endif// HARDWARE_DEVICES_MLU_FUNCTIONS_CUH
diff --git a/src/02hardware/src/devices/mlu/memory.cc b/src/02hardware/src/devices/mlu/memory.cc
new file mode 100644
index 000000000..55550314a
--- /dev/null
+++ b/src/02hardware/src/devices/mlu/memory.cc
@@ -0,0 +1,40 @@
+#include "memory.hh"
+#include "functions.hh"
+
+namespace refactor::hardware {
+#ifdef USE_BANG
+    using M = MluMemory;
+
+    /// Allocates `size` bytes with `cnrtMalloc` and returns the resulting pointer.
+    void *M::malloc(size_t size) {
+        void *ptr;
+        BANG_ASSERT(cnrtMalloc(&ptr, size));
+        return ptr;
+    }
+    /// Releases `ptr` with `cnrtFree`.
+    void M::free(void *ptr) {
+        BANG_ASSERT(cnrtFree(ptr));
+    }
+    /// Copies `bytes` bytes from `src` to `dst` in the host-to-device direction; returns `dst`.
+    void *M::copyHD(void *dst, void const *src, size_t bytes) const {
+        BANG_ASSERT(cnrtMemcpy(dst, const_cast<void *>(src), bytes,
+                               CNRT_MEM_TRANS_DIR_HOST2DEV));
+        return dst;
+    }
+    /// Copies `bytes` bytes from `src` to `dst` in the device-to-host direction; returns `dst`.
+    void *M::copyDH(void *dst, void const *src, size_t bytes) const {
+        BANG_ASSERT(cnrtMemcpy(dst, const_cast<void *>(src), bytes,
+                               CNRT_MEM_TRANS_DIR_DEV2HOST));
+        return dst;
+    }
+    /// Copies `bytes` bytes from `src` to `dst` using the peer-to-peer direction; returns `dst`.
+    /// NOTE(review): CNRT also has a DEV2DEV direction — confirm PEER2PEER is the
+    /// intended one for copies within a single device.
+    void *M::copyDD(void *dst, void const *src, size_t bytes) const {
+        BANG_ASSERT(cnrtMemcpy(dst, const_cast<void *>(src), bytes,
+                               CNRT_MEM_TRANS_DIR_PEER2PEER));
+        return dst;
+    }
+#endif
+
+}// namespace refactor::hardware
diff --git a/src/02hardware/src/devices/mlu/memory.hh b/src/02hardware/src/devices/mlu/memory.hh
new file mode 100644
index 000000000..85ec39887
--- /dev/null
+++ b/src/02hardware/src/devices/mlu/memory.hh
@@ -0,0 +1,22 @@
+#ifndef HARDWARE_DEVICES_MLU_MEMORY_CUH
+#define HARDWARE_DEVICES_MLU_MEMORY_CUH
+
+#include "hardware/memory.h"
+
+namespace refactor::hardware {
+
+    /// `Memory` implementation backed by the MLU runtime.
+    /// All overrides are private; callers are expected to go through the `Memory` interface.
+    class MluMemory final : public Memory {
+        void *malloc(size_t) final;
+        void free(void *) final;
+        // Copy `bytes` bytes from `src` to `dst`; the suffix names the direction
+        // (HD: host to device, DH: device to host, DD: device to device).
+        void *copyHD(void *dst, void const *src, size_t bytes) const final;
+        void *copyDH(void *dst, void const *src, size_t bytes) const final;
+        void *copyDD(void *dst, void const *src, size_t bytes) const final;
+    };
+
+}// namespace refactor::hardware
+
+#endif// HARDWARE_DEVICES_MLU_MEMORY_CUH
diff --git a/src/02hardware/src/devices/nvidia/device.cc b/src/02hardware/src/devices/nvidia/device.cc
index a5dc71631..7b25cc1f8 100644
--- a/src/02hardware/src/devices/nvidia/device.cc
+++ b/src/02hardware/src/devices/nvidia/device.cc
@@ -34,7 +34,7 @@ namespace refactor::hardware {
             size,
             alignment);
 #else
-        RUNTIME_ERROR("CUDA is not enabled");
+        return nullptr;
 #endif
     }
 
diff --git a/src/04kernel/CMakeLists.txt b/src/04kernel/CMakeLists.txt
index 75e7e50fe..3a401ac35 100644
--- a/src/04kernel/CMakeLists.txt
+++ b/src/04kernel/CMakeLists.txt
@@ -35,6 +35,15 @@ if(USE_KUNLUN)
     find_library(KUNLUN_DNN libxpuapi.so ${KUNLUN_HOME}/XTDK/shlib)
     target_link_libraries(kernel PUBLIC ${KUNLUN_RT} ${KUNLUN_DNN})
 endif()
+# Link the Cambricon libraries located by the top-level CMakeLists.txt.
+if(USE_BANG)
+    target_link_libraries(kernel
+        PUBLIC
+            ${CAMBRICON_CNNL}
+            ${CAMBRICON_CNRT}
+            ${CAMBRICON_CNDRV}
+    )
+endif()
 
 file(GLOB_RECURSE KERNEL_TEST test/*.cpp)
 if(KERNEL_TEST)