add version.h to cover version problem
MuGdxy committed Feb 26, 2023
1 parent 1b14da9 commit d28ec9c
Showing 20 changed files with 119 additions and 64 deletions.
16 changes: 8 additions & 8 deletions example/profile/kernel_name.cu
@@ -2,17 +2,17 @@
 #include <muda/muda.h>
 #include <muda/container.h>
 #include "../example_common.h"
+#include <numeric>  // iota
 using namespace muda;
 
 void kernel_name()
 {
     example_desc("use <profile> and <range_name> to help debugging and profiling.");
-    universal_vector<int> h(100);
-    universal_var<int>    v(1);
-    for(size_t i = 0; i < h.size(); i++)
-    {
-        h[i] = i;
-    }
+    host_vector<int>   h_vec(100);
+    device_vector<int> vec(100);
+    device_var<int>    v(1);
+    std::iota(h_vec.begin(), h_vec.end(), 0);
+    vec = h_vec;
 
     {  // give a scope for RAII of profile and range_name
@@ -22,8 +22,8 @@ void kernel_name()
 
         range_name r("kernel apply");  // give a name to this scope.
         parallel_for(32, 32)
-            .apply(h.size(),
-                   [s = make_viewer(h), v = make_viewer(v)] __device__(int i) mutable
+            .apply(vec.size(),
+                   [s = make_viewer(vec), v = make_viewer(v)] __device__(int i) mutable
                    {
                        __shared__ int b[64];
                        s(i) = v;
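For context, the change above swaps unified-memory containers for the explicit host-fill-then-copy idiom. A minimal standalone sketch of that idiom, assuming muda's host_vector/device_vector are the usual thrust aliases:

    #include <thrust/host_vector.h>
    #include <thrust/device_vector.h>
    #include <numeric>  // std::iota

    int main()
    {
        thrust::host_vector<int> h_vec(100);
        std::iota(h_vec.begin(), h_vec.end(), 0);  // fill 0..99 on the host
        thrust::device_vector<int> vec = h_vec;    // one explicit host-to-device copy
        return 0;
    }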
2 changes: 1 addition & 1 deletion src/core/muda/check/checkCusparse.h
@@ -35,4 +35,4 @@ __host__ __device__ inline const char* mudaCudaGetErrorEnum(cusparseStatus_t err)
     }
 
     return "<unknown>";
-}
\ No newline at end of file
+}
1 change: 1 addition & 0 deletions src/core/muda/graph/base.h
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#pragma once
#include <muda/tools/version.h>
#include <cuda_runtime.h>
#include <cuda_runtime_api.h>
#include <cuda_device_runtime_api.h>
Expand Down
28 changes: 15 additions & 13 deletions src/core/muda/graph/graph.h
@@ -52,19 +52,6 @@ class graph
         return ret;
     }
 
-    template <typename T>
-    auto addMemAllocNode(sptr<memAllocNodeParms<T>>& memAllocParms,
-                         const std::vector<sptr<graphNode>>& deps = {})
-    {
-        auto node = std::make_shared<memAllocNode>();
-        std::vector<cudaGraphNode_t> nodes = mapDependencies(deps);
-        checkCudaErrors(cudaGraphAddMemAllocNode(
-            &node->m_handle, m_handle, nodes.data(), nodes.size(), memAllocParms->getRaw()));
-        auto ptr     = reinterpret_cast<T*>(memAllocParms->getRaw()->dptr);
-        node->m_dptr = ptr;
-        return std::make_tuple(node, ptr);
-    }
-
     template <typename T>
     auto addMemcpyNode(T* dst,
                        const T* src,
@@ -79,6 +66,20 @@ class graph
         return ret;
     }
 
+#ifdef MUDA_WITH_GRAPH_MEMORY_ALLOC_FREE
+    template <typename T>
+    auto addMemAllocNode(sptr<memAllocNodeParms<T>>& memAllocParms,
+                         const std::vector<sptr<graphNode>>& deps = {})
+    {
+        auto node = std::make_shared<memAllocNode>();
+        std::vector<cudaGraphNode_t> nodes = mapDependencies(deps);
+        checkCudaErrors(cudaGraphAddMemAllocNode(
+            &node->m_handle, m_handle, nodes.data(), nodes.size(), memAllocParms->getRaw()));
+        auto ptr     = reinterpret_cast<T*>(memAllocParms->getRaw()->dptr);
+        node->m_dptr = ptr;
+        return std::make_tuple(node, ptr);
+    }
+
     auto addMemFreeNode(sptr<memAllocNode> allocNode,
                         const std::vector<sptr<graphNode>>& deps = {})
     {
@@ -97,6 +98,7 @@ class graph
             &ret->m_handle, m_handle, nodes.data(), nodes.size(), ptr));
         return ret;
     }
+#endif
 
     auto addEventRecordNode(cudaEvent_t e, const std::vector<sptr<graphNode>>& deps = {})
     {
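Under the new guard, the alloc/free nodes participate in a graph only when version.h enables them. A hypothetical usage sketch based on the signatures above (graph construction and the sptr alias are assumptions, not shown in this diff):

    #ifdef MUDA_WITH_GRAPH_MEMORY_ALLOC_FREE
        muda::graph g;  // assumption: default-constructible
        auto parms = muda::memory::asAllocNodeParms<int>(1024);
        auto [allocNode, dptr] = g.addMemAllocNode(parms);         // returns (node, device pointer)
        auto freeNode = g.addMemFreeNode(allocNode, {allocNode});  // free depends on alloc
    #endif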
2 changes: 2 additions & 0 deletions src/core/muda/graph/memory_node.h
@@ -3,6 +3,7 @@
 
 namespace muda
 {
+#ifdef MUDA_WITH_GRAPH_MEMORY_ALLOC_FREE
 class memAllocNode : public graphNode
 {
     void* m_dptr;
@@ -43,6 +44,7 @@ class memFreeNode : public graphNode
     using this_type = memFreeNode;
     friend class graph;
 };
+#endif
 
 class memcpyNode : public graphNode
 {
16 changes: 10 additions & 6 deletions src/core/muda/launch/memory.h
@@ -22,18 +22,22 @@ class memory : public launch_base<memory>
         return *this;
     }
 
+    memory& copy(void* dst, const void* src, size_t byte_size, cudaMemcpyKind kind)
+    {
+        checkCudaErrors(cudaMemcpyAsync(dst, src, byte_size, kind, m_stream));
+        return *this;
+    }
+
+#ifdef MUDA_WITH_GRAPH_MEMORY_ALLOC_FREE
     template <typename T>
     MUDA_NODISCARD static auto asAllocNodeParms(size_t count)
     {
         auto parms = std::make_shared<memAllocNodeParms<T>>(count);
        return parms;
     }
+#endif
 
-    memory& copy(void* dst, const void* src, size_t byte_size, cudaMemcpyKind kind)
-    {
-        checkCudaErrors(cudaMemcpyAsync(dst, src, byte_size, kind, m_stream));
-        return *this;
-    }
-
 
 
     memory& set(void* data, size_t byte_size, char byte = 0)
     {
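The stream-ordered copy is now available regardless of the feature macros. A brief usage sketch; constructing muda::memory from a stream is an assumption inferred from the launch_base pattern, not confirmed by this diff:

    int *dst = nullptr, *src = nullptr;
    checkCudaErrors(cudaMalloc(&dst, 100 * sizeof(int)));
    checkCudaErrors(cudaMalloc(&src, 100 * sizeof(int)));
    // enqueue an async device-to-device copy on the default stream
    muda::memory(nullptr).copy(dst, src, 100 * sizeof(int), cudaMemcpyDeviceToDevice);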
13 changes: 13 additions & 0 deletions src/core/muda/tools/version.h
@@ -0,0 +1,13 @@
+#pragma once
+// muda's baseline cuda version is 11.6
+#define MUDA_BASELINE_CUDACC_VER_MAJOR 11
+#define MUDA_BASELINE_CUDACC_VER_MINOR 6
+
+#if (__CUDACC_VER_MAJOR__ > MUDA_BASELINE_CUDACC_VER_MAJOR) ||                \
+    (__CUDACC_VER_MAJOR__ == MUDA_BASELINE_CUDACC_VER_MAJOR && __CUDACC_VER_MINOR__ >= MUDA_BASELINE_CUDACC_VER_MINOR)
+
+#define MUDA_BASELINE_CUDACC_VER_SATISFIED
+#define MUDA_WITH_THRUST_UNIVERSAL
+#define MUDA_WITH_GRAPH_MEMORY_ALLOC_FREE
+
+#endif
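Note that the guard compares major and minor components separately: a naive (major >= 11) && (minor >= 6) check would wrongly reject CUDA 12.0. A quick constexpr sanity check of the intended semantics (a sketch, not part of the commit; the helper merely mirrors the #if condition):

    // hypothetical helper mirroring the preprocessor condition above
    constexpr bool baseline_satisfied(int major, int minor)
    {
        return major > 11 || (major == 11 && minor >= 6);
    }
    static_assert(baseline_satisfied(11, 6), "the baseline itself qualifies");
    static_assert(baseline_satisfied(12, 0), "12.0 is newer than 11.6 and must qualify");
    static_assert(!baseline_satisfied(11, 4), "11.4 predates the baseline");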
19 changes: 12 additions & 7 deletions src/ext/muda/blas/blas.h
@@ -1,4 +1,6 @@
 #pragma once
+#include <muda/tools/version.h>
+
 #include <muda/check/checkCusparse.h>
 #include <muda/check/checkCublas.h>
 #include <muda/check/checkCudaErrors.h>
@@ -24,22 +26,25 @@ class blasContext
     blasContext(cudaStream_t stream = nullptr)
         : m_stream(stream)
     {
-        checkCudaErrors(cusparseCreate(&m_csh));
         checkCudaErrors(cublasCreate_v2(&m_cbh));
-        checkCudaErrors(cusparseSetStream(m_csh, stream));
         checkCudaErrors(cublasSetStream_v2(m_cbh, stream));
+
+        checkCudaErrors(cusparseCreate(&m_csh));
+        checkCudaErrors(cusparseSetStream(m_csh, stream));
     }
 
     ~blasContext()
     {
-        checkCudaErrors(cusparseDestroy(m_csh));
         checkCudaErrors(cublasDestroy_v2(m_cbh));
+
+        checkCudaErrors(cusparseDestroy(m_csh));
     }
 
     operator cusparseHandle_t() { return m_csh; }
-    cusparseHandle_t spHandle() { return m_csh; };
-
     operator cublasHandle_t() { return m_cbh; }
     operator cudaStream_t() { return m_stream; }
+
+    cusparseHandle_t spHandle() { return m_csh; };
     cublasHandle_t   dnHandle() { return m_cbh; };
     cudaStream_t     stream() { return m_stream; };
 };
@@ -144,7 +149,7 @@ class blas : public launch_base<blas>
                                 b.data(),
                                 y_inout,
                                 details::cudaDataTypeMap_v<value_type>,
-                                cusparseSpMVAlg_t::CUSPARSE_SPMV_ALG_DEFAULT,
+                                cusparseSpMVAlg_t::CUSPARSE_MV_ALG_DEFAULT,
                                 &bufferSize));
 
         details::set_stream_check(external_buffer, m_ctx);
@@ -158,7 +163,7 @@ class blas : public launch_base<blas>
                              b.data(),
                              y_inout,
                              details::cudaDataTypeMap_v<value_type>,
-                             cusparseSpMVAlg_t::CUSPARSE_SPMV_ALG_DEFAULT,
+                             cusparseSpMVAlg_t::CUSPARSE_MV_ALG_DEFAULT,
                              external_buffer.data()));
         return *this;
     }
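blasContext bundles the two library handles behind one RAII object. A short usage sketch grounded in the constructor and accessors shown above:

    // creates both handles on the default stream; both destroyed in ~blasContext()
    muda::blasContext ctx;
    cusparseHandle_t sp = ctx.spHandle();  // sparse (cusparse) handle
    cublasHandle_t   dn = ctx.dnHandle();  // dense (cublas) handle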
1 change: 0 additions & 1 deletion src/ext/muda/blas/data_type_map.h
@@ -54,7 +54,6 @@ namespace details
 //template<> constexpr cudaDataType cudaDataTypeMap_v< > = CUDA_C_64I;
 //template<> constexpr cudaDataType cudaDataTypeMap_v< > = CUDA_C_64U;
 
-
 template <typename T>
 struct cusparseIndexTypeMap
 {
14 changes: 11 additions & 3 deletions src/ext/muda/blas/dense.h
@@ -1,4 +1,6 @@
 #pragma once
+#include <muda/tools/version.h>
+
 #include <cublas.h>
 #include "data_type_map.h"
 #include <muda/check/checkCublas.h>
@@ -18,13 +20,19 @@ class dense_vec
     {
         checkCudaErrors(cusparseCreateDnVec(&m_dnvec, n, data, details::cudaDataTypeMap_v<T>));
     }
-    ~dense_vec() { checkCudaErrors(cusparseDestroyDnVec(m_dnvec)); }
+
+    ~dense_vec()
+    {
+        checkCudaErrors(cusparseDestroyDnVec(m_dnvec));
+    }
 
     value_type*       data() { return m_data; }
     const value_type* data() const { return m_data; }
     size_t            size() const { return m_size; }
-    operator cusparseDnVecDescr_t() { return m_dnvec; }
+
+    operator cusparseDnVecDescr_t() { return m_dnvec; }
 
   private:
+
    cusparseDnVecDescr_t m_dnvec;
    value_type*          m_data;
    size_t               m_size;
@@ -51,4 +59,4 @@ inline __host__ auto make_dense_vec(device_buffer<T>& buf)
 {
     return dense_vec<T>(buf.data(), buf.size());
 }
-}  // namespace muda
\ No newline at end of file
+}  // namespace muda
3 changes: 2 additions & 1 deletion src/ext/muda/blas/mat_view.h
@@ -16,12 +16,13 @@ class mat_view
         , m_trans(CUSPARSE_OPERATION_NON_TRANSPOSE)
     {
     }
+
     mat_view(matrix_type& mat, cusparseOperation_t trans)
         : m_mat(mat)
         , m_trans(trans)
     {
     }
+
 
-
     matrix_type&        m_mat;
     cusparseOperation_t m_trans;
3 changes: 2 additions & 1 deletion src/ext/muda/blas/sparse.h
@@ -1,4 +1,5 @@
 #pragma once
+#include <muda/tools/version.h>
 #include <cublas.h>
 #include <cusparse.h>
 #include <muda/container/vector.h>
@@ -143,4 +144,4 @@ MUDA_INLINE MUDA_HOST auto make_viewer(matCSR<T>& m) MUDA_NOEXCEPT
 {
     return make_csr(m);
 }
-}  // namespace muda
\ No newline at end of file
+}  // namespace muda
3 changes: 3 additions & 0 deletions src/ext/muda/composite/cse.h
@@ -64,9 +64,12 @@ template <typename T>
 using host_cse =
     compressed_sparse_elements<host_vector<T>, host_vector<int>, host_vector<int>>;
 
+
+#ifdef MUDA_WITH_THRUST_UNIVERSAL
 template <typename T>
 using universal_cse =
     compressed_sparse_elements<universal_vector<T>, universal_vector<int>, universal_vector<int>>;
+#endif
 
 template <typename T>
 class device_buffer_cse
13 changes: 12 additions & 1 deletion src/ext/muda/container/var.h
@@ -1,11 +1,17 @@
 #pragma once
+#include <muda/tools/version.h>
 #include <vector>
 #include <thrust/device_allocator.h>
+
+#ifdef MUDA_WITH_THRUST_UNIVERSAL
 #include <thrust/universal_allocator.h>
+#endif
+
 #include <thrust/detail/raw_pointer_cast.h>
 #include <thrust/fill.h>
 #include <thrust/copy.h>
 #include <muda/muda_def.h>
+#include <muda/viewer/dense.h>
 
 namespace muda
 {
@@ -59,13 +65,17 @@ namespace details
 template <typename T, typename Allocator = thrust::device_allocator<T>>
 using device_var = details::var_base<T, Allocator>;
 
+#ifdef MUDA_WITH_THRUST_UNIVERSAL
 template <typename T, typename Allocator = thrust::universal_allocator<T>>
 using universal_var = details::var_base<T, Allocator>;
+#endif
 
 template <typename T, typename Allocator = std::allocator<T>>
 using host_var = details::var_base<T, Allocator>;
 }  // namespace muda
 
+
+// cast
 namespace muda
 {
 template <typename T, typename Allocator>
@@ -81,7 +91,8 @@ MUDA_INLINE T* data(details::var_base<T, Allocator>& v) MUDA_NOEXCEPT
 }
 }  // namespace muda
 
-#include <muda/viewer/dense.h>
+
+// viewer
 namespace muda
 {
 template <typename T, typename Allocator>
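With these guards in place, downstream code can use device_var unconditionally and fall back when universal memory is unavailable. A minimal consumer sketch (value construction assumed from the examples earlier in this commit):

    #include <muda/container/var.h>

    void vars()
    {
        muda::device_var<int> dv(1);     // always available
    #ifdef MUDA_WITH_THRUST_UNIVERSAL
        muda::universal_var<int> uv(1);  // only when CUDA >= 11.6 per version.h
    #endif
    }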
(6 more changed files not shown)
