From 07aa55aad1f178a5b4f4fb1d489e4ed3a40f00d7 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Fri, 28 Jan 2022 11:02:18 +0100 Subject: [PATCH 1/3] Use the memory pool with compute-sanitizer. This is supported nowadays, and slightly speeds up testing. --- src/pool.jl | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/pool.jl b/src/pool.jl index a187a6d7b1..e7300c96ee 100644 --- a/src/pool.jl +++ b/src/pool.jl @@ -68,8 +68,7 @@ const __stream_ordered = LazyInitialized{Vector{Bool}}() function stream_ordered(dev::CuDevice) flags = get!(__stream_ordered) do val = Vector{Bool}(undef, ndevices()) - if version() < v"11.2" || haskey(ENV, "CUDA_MEMCHECK") || - get(ENV, "JULIA_CUDA_MEMORY_POOL", "cuda") == "none" + if version() < v"11.2" || get(ENV, "JULIA_CUDA_MEMORY_POOL", "cuda") == "none" fill!(val, false) else for dev in devices() From 5c4e4ed276a3ca06214bfdeb7373ec48f5985eb5 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Fri, 28 Jan 2022 11:02:38 +0100 Subject: [PATCH 2/3] Configure the pool to use all memory. Otherwise we spend too much time releasing memory upon every synchronization. --- src/pool.jl | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/pool.jl b/src/pool.jl index e7300c96ee..b530cf9794 100644 --- a/src/pool.jl +++ b/src/pool.jl @@ -91,6 +91,9 @@ function pool_mark(dev::CuDevice) if status[] === nothing pool = memory_pool(dev) + # allow the pool to use up all memory of this device + attribute!(memory_pool(dev), MEMPOOL_ATTR_RELEASE_THRESHOLD, typemax(UInt64)) + # launch a task to periodically trim the pool if isinteractive() && !isassigned(__pool_cleanup) __pool_cleanup[] = @async pool_cleanup() From f1f306c8c4e85bf8b32008e1ac3418b6c8d02f86 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Fri, 28 Jan 2022 11:38:46 +0100 Subject: [PATCH 3/3] Simplify checking for memory pool support. 
--- lib/cudadrv/devices.jl | 11 +++++------ lib/cudadrv/memory.jl | 2 +- src/pool.jl | 12 +++--------- test/cudadrv.jl | 2 +- 4 files changed, 10 insertions(+), 17 deletions(-) diff --git a/lib/cudadrv/devices.jl b/lib/cudadrv/devices.jl index 483972ef21..543524cd19 100644 --- a/lib/cudadrv/devices.jl +++ b/lib/cudadrv/devices.jl @@ -163,7 +163,7 @@ end ## attributes -export attribute, warpsize, capability, unified_addressing +export attribute, warpsize, capability, memory_pools_supported, unified_addressing """ attribute(dev::CuDevice, code) @@ -195,11 +195,10 @@ function capability(dev::CuDevice) attribute(dev, DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR)) end -has_stream_ordered(dev::CuDevice) = - @memoize dev::CuDevice begin - CUDA.version() >= v"11.2" && !haskey(ENV, "CUDA_MEMCHECK") && - attribute(dev, DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED) == 1 - end::Bool +memory_pools_supported(dev::CuDevice) = + CUDA.version() >= v"11.2" && + attribute(dev, DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED) == 1 +@deprecate has_stream_ordered(dev::CuDevice) memory_pools_supported(dev) unified_addressing(dev::CuDevice) = attribute(dev, DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING) == 1 diff --git a/lib/cudadrv/memory.jl b/lib/cudadrv/memory.jl index b63a33ba80..d87195fc97 100644 --- a/lib/cudadrv/memory.jl +++ b/lib/cudadrv/memory.jl @@ -68,7 +68,7 @@ GPU, and requires explicit calls to `unsafe_copyto!`, which wraps `cuMemcpy`, for access on the CPU. 
""" function alloc(::Type{DeviceBuffer}, bytesize::Integer; - async::Bool=CUDA.has_stream_ordered(device()), + async::Bool=memory_pools_supported(device()), stream::Union{Nothing,CuStream}=nothing, pool::Union{Nothing,CuMemoryPool}=nothing) bytesize == 0 && return DeviceBuffer() diff --git a/src/pool.jl b/src/pool.jl index b530cf9794..f506fd00f6 100644 --- a/src/pool.jl +++ b/src/pool.jl @@ -66,16 +66,10 @@ end const __stream_ordered = LazyInitialized{Vector{Bool}}() function stream_ordered(dev::CuDevice) + # TODO: improve @memoize to use the device ID to index a known-length vector cache. flags = get!(__stream_ordered) do - val = Vector{Bool}(undef, ndevices()) - if version() < v"11.2" || get(ENV, "JULIA_CUDA_MEMORY_POOL", "cuda") == "none" - fill!(val, false) - else - for dev in devices() - val[deviceid(dev)+1] = attribute(dev, DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED) == 1 - end - end - val + [memory_pools_supported(dev) && get(ENV, "JULIA_CUDA_MEMORY_POOL", "cuda") == "cuda" + for dev in devices()] end @inbounds flags[deviceid(dev)+1] end diff --git a/test/cudadrv.jl b/test/cudadrv.jl index 82488eecde..e9b3b1e246 100644 --- a/test/cudadrv.jl +++ b/test/cudadrv.jl @@ -455,7 +455,7 @@ for srcTy in [Mem.Device, Mem.Host, Mem.Unified], # test device with context in which pointer was allocated. @test device(typed_pointer(src, T)) == device() - if !CUDA.has_stream_ordered(device()) + if !memory_pools_supported(device()) # NVIDIA bug #3319609 @test context(typed_pointer(src, T)) == context() end