Slight changes to pool management #1344

Merged · 3 commits · Jan 28, 2022
11 changes: 5 additions & 6 deletions lib/cudadrv/devices.jl
@@ -163,7 +163,7 @@ end

 ## attributes

-export attribute, warpsize, capability, unified_addressing
+export attribute, warpsize, capability, memory_pools_supported, unified_addressing

 """
     attribute(dev::CuDevice, code)
@@ -195,11 +195,10 @@ function capability(dev::CuDevice)
              attribute(dev, DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR))
 end

-has_stream_ordered(dev::CuDevice) =
-    @memoize dev::CuDevice begin
-        CUDA.version() >= v"11.2" && !haskey(ENV, "CUDA_MEMCHECK") &&
-        attribute(dev, DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED) == 1
-    end::Bool
+memory_pools_supported(dev::CuDevice) =
+    CUDA.version() >= v"11.2" &&
+    attribute(dev, DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED) == 1
+@deprecate has_stream_ordered(dev::CuDevice) memory_pools_supported(dev)

 unified_addressing(dev::CuDevice) =
     attribute(dev, DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING) == 1
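For callers, the rename is mechanical, and the `@deprecate` keeps old code working with a warning. Note that the new query also drops the old `CUDA_MEMCHECK` environment check rather than relocating it. A minimal usage sketch, assuming CUDA.jl with this change and a working GPU (the `@info` message is illustrative):

```julia
using CUDA

dev = device()  # the current CuDevice

# new exported query: true on CUDA 11.2+ when the device reports
# DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED
if memory_pools_supported(dev)
    @info "memory pools (stream-ordered allocation) available" dev
end

# the old, unexported spelling still resolves, but warns about the rename
CUDA.has_stream_ordered(dev)
```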
2 changes: 1 addition & 1 deletion lib/cudadrv/memory.jl
@@ -68,7 +68,7 @@ GPU, and requires explicit calls to `unsafe_copyto!`, which wraps `cuMemcpy`,
 for access on the CPU.
 """
 function alloc(::Type{DeviceBuffer}, bytesize::Integer;
-               async::Bool=CUDA.has_stream_ordered(device()),
+               async::Bool=memory_pools_supported(device()),
                stream::Union{Nothing,CuStream}=nothing,
                pool::Union{Nothing,CuMemoryPool}=nothing)
     bytesize == 0 && return DeviceBuffer()
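The only functional change here is the default of the `async` keyword, which now keys off the renamed device query. A hedged sketch of both paths (the buffer size is arbitrary; `Mem.alloc`/`Mem.free` with `Mem.Device` follow the spellings used by the test file below):

```julia
using CUDA

# default: stream-ordered (async) allocation wherever the device
# supports memory pools, plain cuMemAlloc otherwise
buf = Mem.alloc(Mem.Device, 1 << 20)

# force the legacy synchronous allocator regardless of device support
sync_buf = Mem.alloc(Mem.Device, 1 << 20; async=false)

Mem.free(buf)
Mem.free(sync_buf)
```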
16 changes: 6 additions & 10 deletions src/pool.jl
@@ -66,17 +66,10 @@ end

 const __stream_ordered = LazyInitialized{Vector{Bool}}()
 function stream_ordered(dev::CuDevice)
-    # TODO: improve @memoize to use the device ID to index a know-length vector cache.
     flags = get!(__stream_ordered) do
-        val = Vector{Bool}(undef, ndevices())
-        if version() < v"11.2" || haskey(ENV, "CUDA_MEMCHECK") ||
-           get(ENV, "JULIA_CUDA_MEMORY_POOL", "cuda") == "none"
-            fill!(val, false)
-        else
-            for dev in devices()
-                val[deviceid(dev)+1] = attribute(dev, DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED) == 1
-            end
-        end
-        val
+        [memory_pools_supported(dev) && get(ENV, "JULIA_CUDA_MEMORY_POOL", "cuda") == "cuda"
+         for dev in devices()]
     end
     @inbounds flags[deviceid(dev)+1]
 end
@@ -92,6 +85,9 @@ function pool_mark(dev::CuDevice)
     if status[] === nothing
         pool = memory_pool(dev)

+        # allow the pool to use up all memory of this device
+        attribute!(memory_pool(dev), MEMPOOL_ATTR_RELEASE_THRESHOLD, typemax(UInt64))
+
         # launch a task to periodically trim the pool
         if isinteractive() && !isassigned(__pool_cleanup)
             __pool_cleanup[] = @async pool_cleanup()
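Two behavioral points fall out of this file: `stream_ordered` now enables the pool only when `JULIA_CUDA_MEMORY_POOL` is left at its default `cuda`, and `pool_mark` raises the pool's release threshold so freed memory stays cached in the pool instead of being returned to the driver eagerly. A sketch of the attribute round-trip, assuming the unexported `CUDA.memory_pool`, `CUDA.attribute` and `CUDA.attribute!` wrappers keep their current spellings:

```julia
using CUDA

dev  = device()
pool = CUDA.memory_pool(dev)  # the device's current memory pool

# what pool_mark now does on first use of a device: an effectively
# infinite release threshold, i.e. never eagerly shrink the pool
CUDA.attribute!(pool, CUDA.MEMPOOL_ATTR_RELEASE_THRESHOLD, typemax(UInt64))

# reading the attribute back
thr = CUDA.attribute(UInt64, pool, CUDA.MEMPOOL_ATTR_RELEASE_THRESHOLD)
@assert thr == typemax(UInt64)
```

Opting out of pooled allocations entirely remains an environment switch, checked once per session: start Julia with `JULIA_CUDA_MEMORY_POOL=none`.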
2 changes: 1 addition & 1 deletion test/cudadrv.jl
@@ -455,7 +455,7 @@ for srcTy in [Mem.Device, Mem.Host, Mem.Unified],

     # test device with context in which pointer was allocated.
     @test device(typed_pointer(src, T)) == device()
-    if !CUDA.has_stream_ordered(device())
+    if !memory_pools_supported(device())
         # NVIDIA bug #3319609
         @test context(typed_pointer(src, T)) == context()
     end