From 07aa55aad1f178a5b4f4fb1d489e4ed3a40f00d7 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Fri, 28 Jan 2022 11:02:18 +0100 Subject: [PATCH 1/3] Use the memory pool with compute-sanitizer. This is supported nowadays, and slightly speeds up testing. --- src/pool.jl | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/pool.jl b/src/pool.jl index a187a6d7b1..e7300c96ee 100644 --- a/src/pool.jl +++ b/src/pool.jl @@ -68,8 +68,7 @@ const __stream_ordered = LazyInitialized{Vector{Bool}}() function stream_ordered(dev::CuDevice) flags = get!(__stream_ordered) do val = Vector{Bool}(undef, ndevices()) - if version() < v"11.2" || haskey(ENV, "CUDA_MEMCHECK") || - get(ENV, "JULIA_CUDA_MEMORY_POOL", "cuda") == "none" + if version() < v"11.2" || get(ENV, "JULIA_CUDA_MEMORY_POOL", "cuda") == "none" fill!(val, false) else for dev in devices() From 5c4e4ed276a3ca06214bfdeb7373ec48f5985eb5 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Fri, 28 Jan 2022 11:02:38 +0100 Subject: [PATCH 2/3] Configure the pool to use all memory. Otherwise we spend too much time releasing memory upon every synchronization. --- src/pool.jl | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/pool.jl b/src/pool.jl index e7300c96ee..b530cf9794 100644 --- a/src/pool.jl +++ b/src/pool.jl @@ -91,6 +91,9 @@ function pool_mark(dev::CuDevice) if status[] === nothing pool = memory_pool(dev) + # allow the pool to use up all memory of this device + attribute!(memory_pool(dev), MEMPOOL_ATTR_RELEASE_THRESHOLD, typemax(UInt64)) + # launch a task to periodically trim the pool if isinteractive() && !isassigned(__pool_cleanup) __pool_cleanup[] = @async pool_cleanup() From f1f306c8c4e85bf8b32008e1ac3418b6c8d02f86 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Fri, 28 Jan 2022 11:38:46 +0100 Subject: [PATCH 3/3] Simplify checking for memory pool support. 
--- lib/cudadrv/devices.jl | 11 +++++------ lib/cudadrv/memory.jl | 2 +- src/pool.jl | 12 +++--------- test/cudadrv.jl | 2 +- 4 files changed, 10 insertions(+), 17 deletions(-) diff --git a/lib/cudadrv/devices.jl b/lib/cudadrv/devices.jl index 483972ef21..543524cd19 100644 --- a/lib/cudadrv/devices.jl +++ b/lib/cudadrv/devices.jl @@ -163,7 +163,7 @@ end ## attributes -export attribute, warpsize, capability, unified_addressing +export attribute, warpsize, capability, memory_pools_supported, unified_addressing """ attribute(dev::CuDevice, code) @@ -195,11 +195,10 @@ function capability(dev::CuDevice) attribute(dev, DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR)) end -has_stream_ordered(dev::CuDevice) = - @memoize dev::CuDevice begin - CUDA.version() >= v"11.2" && !haskey(ENV, "CUDA_MEMCHECK") && - attribute(dev, DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED) == 1 - end::Bool +memory_pools_supported(dev::CuDevice) = + CUDA.version() >= v"11.2" && + attribute(dev, DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED) == 1 +@deprecate has_stream_ordered(dev::CuDevice) memory_pools_supported(dev) unified_addressing(dev::CuDevice) = attribute(dev, DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING) == 1 diff --git a/lib/cudadrv/memory.jl b/lib/cudadrv/memory.jl index b63a33ba80..d87195fc97 100644 --- a/lib/cudadrv/memory.jl +++ b/lib/cudadrv/memory.jl @@ -68,7 +68,7 @@ GPU, and requires explicit calls to `unsafe_copyto!`, which wraps `cuMemcpy`, for access on the CPU. 
""" function alloc(::Type{DeviceBuffer}, bytesize::Integer; - async::Bool=CUDA.has_stream_ordered(device()), + async::Bool=memory_pools_supported(device()), stream::Union{Nothing,CuStream}=nothing, pool::Union{Nothing,CuMemoryPool}=nothing) bytesize == 0 && return DeviceBuffer() diff --git a/src/pool.jl b/src/pool.jl index b530cf9794..f506fd00f6 100644 --- a/src/pool.jl +++ b/src/pool.jl @@ -66,16 +66,10 @@ end const __stream_ordered = LazyInitialized{Vector{Bool}}() function stream_ordered(dev::CuDevice) + # TODO: improve @memoize to use the device ID to index a known-length vector cache. flags = get!(__stream_ordered) do - val = Vector{Bool}(undef, ndevices()) - if version() < v"11.2" || get(ENV, "JULIA_CUDA_MEMORY_POOL", "cuda") == "none" - fill!(val, false) - else - for dev in devices() - val[deviceid(dev)+1] = attribute(dev, DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED) == 1 - end - end - val + [memory_pools_supported(dev) && get(ENV, "JULIA_CUDA_MEMORY_POOL", "cuda") == "cuda" + for dev in devices()] end @inbounds flags[deviceid(dev)+1] end diff --git a/test/cudadrv.jl b/test/cudadrv.jl index 82488eecde..e9b3b1e246 100644 --- a/test/cudadrv.jl +++ b/test/cudadrv.jl @@ -455,7 +455,7 @@ for srcTy in [Mem.Device, Mem.Host, Mem.Unified], # test device with context in which pointer was allocated. @test device(typed_pointer(src, T)) == device() - if !CUDA.has_stream_ordered(device()) + if !memory_pools_supported(device()) # NVIDIA bug #3319609 @test context(typed_pointer(src, T)) == context() end