From 36be160cefe849dad375a0ef8087bd7f3a78bb7a Mon Sep 17 00:00:00 2001
From: Oliver Schulz
Date: Sun, 22 May 2022 13:44:41 +0200
Subject: [PATCH] Support Adapt.AbstractGPUDevice

---
 docs/src/index.md                  |  2 +-
 examples/matmul.jl                 |  4 ++--
 examples/memcopy.jl                |  2 +-
 examples/memcopy_static.jl         |  2 +-
 examples/mpi.jl                    |  2 +-
 examples/naive_transpose.jl        |  4 ++--
 examples/performance.jl            | 12 ++++++------
 lib/CUDAKernels/src/CUDAKernels.jl | 25 +++++++++++++------------
 lib/ROCKernels/src/ROCKernels.jl   |  4 +---
 src/KernelAbstractions.jl          | 23 ++++-------------------
 test/test.jl                       | 14 --------------
 11 files changed, 32 insertions(+), 62 deletions(-)

diff --git a/docs/src/index.md b/docs/src/index.md
index 4982516a..ec6a2ba0 100644
--- a/docs/src/index.md
+++ b/docs/src/index.md
@@ -56,7 +56,7 @@ kernel on it instead. For example, launching on a CUDA GPU:
 ```julia
 using CUDAKernels # Required to access CUDADevice
 A = CUDA.ones(1024, 1024)
-kernel = mul2(CUDADevice(), 16)
+kernel = mul2(get_computing_device(A), 16)
 # ... the rest is the same!
 ```
 
diff --git a/examples/matmul.jl b/examples/matmul.jl
index 054f8c15..d9939820 100644
--- a/examples/matmul.jl
+++ b/examples/matmul.jl
@@ -1,4 +1,4 @@
-using KernelAbstractions, Test
+using KernelAbstractions, Adapt, Test
 include(joinpath(@__DIR__, "utils.jl")) # Load backend
 
 if has_cuda && has_cuda_gpu()
@@ -24,7 +24,7 @@ function matmul!(a, b, c)
         println("Matrix size mismatch!")
         return nothing
     end
-    device = KernelAbstractions.get_device(a)
+    device = get_computing_device(a)
     n = device isa GPU ? 256 : 4
     kernel! = matmul_kernel!(device, n)
     kernel!(a, b, c, ndrange=size(c))
diff --git a/examples/memcopy.jl b/examples/memcopy.jl
index b51e83d4..cbe6fe80 100644
--- a/examples/memcopy.jl
+++ b/examples/memcopy.jl
@@ -22,7 +22,7 @@ wait(event)
 if has_cuda && has_cuda_gpu()
     function mycopy!(A::CuArray, B::CuArray)
         @assert size(A) == size(B)
-        copy_kernel!(CUDADevice(), 256)(A, B, ndrange=length(A))
+        copy_kernel!(get_computing_device(A), 256)(A, B, ndrange=length(A))
     end
 
     A = CuArray{Float32}(undef, 1024)
diff --git a/examples/memcopy_static.jl b/examples/memcopy_static.jl
index cac10886..5a3b5854 100644
--- a/examples/memcopy_static.jl
+++ b/examples/memcopy_static.jl
@@ -22,7 +22,7 @@ if has_cuda && has_cuda_gpu()
 
     function mycopy_static!(A::CuArray, B::CuArray)
         @assert size(A) == size(B)
-        kernel = copy_kernel!(CUDADevice(), 32, size(A)) # if size(A) varies this will cause recompilation
+        kernel = copy_kernel!(get_computing_device(A), 32, size(A)) # if size(A) varies this will cause recompilation
         kernel(A, B, ndrange=size(A))
     end
 
diff --git a/examples/mpi.jl b/examples/mpi.jl
index 369666df..3121bb8c 100644
--- a/examples/mpi.jl
+++ b/examples/mpi.jl
@@ -11,7 +11,7 @@ end
 
 using MPI
 
-device(A) = typeof(A) <: Array ? CPU() : CUDADevice()
+device(A) = typeof(A) <: Array ? CPU() : CUDA.device()
 
 function mpiyield()
     MPI.Iprobe(MPI.MPI_ANY_SOURCE, MPI.MPI_ANY_TAG, MPI.COMM_WORLD)
diff --git a/examples/naive_transpose.jl b/examples/naive_transpose.jl
index 77877647..38e75778 100644
--- a/examples/naive_transpose.jl
+++ b/examples/naive_transpose.jl
@@ -1,4 +1,4 @@
-using KernelAbstractions, Test
+using KernelAbstractions, Adapt, Test
 include(joinpath(@__DIR__, "utils.jl")) # Load backend
 
 if has_cuda && has_cuda_gpu()
@@ -17,7 +17,7 @@ function naive_transpose!(a, b)
         println("Matrix size mismatch!")
         return nothing
     end
-    device = KernelAbstractions.get_device(a)
+    device = get_computing_device(a)
     n = device isa GPU ? 256 : 4
     kernel! = naive_transpose_kernel!(device, n)
     kernel!(a, b, ndrange=size(a))
diff --git a/examples/performance.jl b/examples/performance.jl
index c223fd7f..a4f184af 100644
--- a/examples/performance.jl
+++ b/examples/performance.jl
@@ -133,8 +133,8 @@ end
 
 for block_dims in ((TILE_DIM, TILE_DIM), (TILE_DIM*TILE_DIM, 1), (1, TILE_DIM*TILE_DIM))
     for (name, kernel) in (
-        ("copy", simple_copy_kernel!(CUDADevice(), block_dims)),
-        ("transpose", simple_transpose_kernel!(CUDADevice(), block_dims)),
+        ("copy", simple_copy_kernel!(CUDA.device(), block_dims)),
+        ("transpose", simple_transpose_kernel!(CUDA.device(), block_dims)),
     )
         NVTX.@range "Simple $name $block_dims" let
             input = CUDA.rand(T, (N, N))
@@ -154,8 +154,8 @@ end
 
 # Benchmark localmem
 for (name, kernel) in (
-    ("copy", lmem_copy_kernel!(CUDADevice(), (TILE_DIM, TILE_DIM))),
-    ("transpose", lmem_transpose_kernel!(CUDADevice(), (TILE_DIM, TILE_DIM))),
+    ("copy", lmem_copy_kernel!(CUDA.device(), (TILE_DIM, TILE_DIM))),
+    ("transpose", lmem_transpose_kernel!(CUDA.device(), (TILE_DIM, TILE_DIM))),
 )
     for bank in (true, false)
         NVTX.@range "Localmem $name ($TILE_DIM, $TILE_DIM) bank=$bank" let
@@ -176,8 +176,8 @@ end
 
 # Benchmark localmem + multiple elements per lane
 for (name, kernel) in (
-    ("copy", coalesced_copy_kernel!(CUDADevice(), (TILE_DIM, BLOCK_ROWS))),
-    ("transpose", coalesced_transpose_kernel!(CUDADevice(), (TILE_DIM, BLOCK_ROWS))),
+    ("copy", coalesced_copy_kernel!(CUDA.device(), (TILE_DIM, BLOCK_ROWS))),
+    ("transpose", coalesced_transpose_kernel!(CUDA.device(), (TILE_DIM, BLOCK_ROWS))),
 )
     for bank in (true, false)
         NVTX.@range "Localmem + multiple elements $name ($TILE_DIM, $BLOCK_ROWS) bank=$bank" let
diff --git a/lib/CUDAKernels/src/CUDAKernels.jl b/lib/CUDAKernels/src/CUDAKernels.jl
index 43337d73..c5c4112d 100644
--- a/lib/CUDAKernels/src/CUDAKernels.jl
+++ b/lib/CUDAKernels/src/CUDAKernels.jl
@@ -6,10 +6,9 @@ import StaticArrays: MArray
 import Adapt
 import KernelAbstractions
 
-export CUDADevice
+using Adapt: get_computing_device
 
-KernelAbstractions.get_device(::CUDA.CuArray) = CUDADevice()
-KernelAbstractions.get_device(::CUDA.CUSPARSE.AbstractCuSparseArray) = CUDADevice()
+export CUDADevice
 
 const FREE_STREAMS = CUDA.CuStream[]
 const STREAMS = CUDA.CuStream[]
@@ -94,7 +93,7 @@ end
 
 import KernelAbstractions: Event, CPUEvent, NoneEvent, MultiEvent, CPU, GPU, isdone, failed
 
-struct CUDADevice <: GPU end
+const CUDADevice = CUDA.CuDevice
 
 struct CudaEvent <: Event
     event::CUDA.CuEvent
@@ -103,6 +102,8 @@ end
 failed(::CudaEvent) = false
 isdone(ev::CudaEvent) = CUDA.query(ev.event)
 
+Adapt.get_computing_device(ev::CudaEvent) = get_computing_device(ev.event)
+
 function Event(::CUDADevice)
     stream = CUDA.stream()
     event = CUDA.CuEvent(CUDA.EVENT_DISABLE_TIMING)
@@ -134,11 +135,11 @@ function wait(::CPU, ev::CudaEvent, progress=nothing)
     event = Base.Event()
 
     stream = next_stream()
-    wait(CUDADevice(), ev, nothing, stream)
+    dev = get_computing_device(ev)
+    wait(dev, ev, nothing, stream)
     CUDA.launch(;stream) do
         notify(event)
     end
-    dev = CUDA.device()
     # if an error occurs, the callback may never fire, so use a timer to detect such cases
     timer = Timer(0; interval=1)
     Base.@sync begin
@@ -169,7 +170,7 @@ end
 wait(::CUDADevice, ev::CudaEvent, progress=nothing, stream=CUDA.stream()) = CUDA.wait(ev.event, stream)
 wait(::CUDADevice, ev::NoneEvent, progress=nothing, stream=nothing) = nothing
 
-function wait(::CUDADevice, ev::MultiEvent, progress=nothing, stream=CUDA.stream())
+function wait(dev::CUDADevice, ev::MultiEvent, progress=nothing, stream=CUDA.stream())
     dependencies = collect(ev.events)
     cudadeps = filter(d->d isa CudaEvent, dependencies)
     otherdeps = filter(d->!(d isa CudaEvent), dependencies)
@@ -177,7 +178,7 @@ function wait(::CUDADevice, ev::MultiEvent, progress=nothing, stream=CUDA.stream
         CUDA.wait(event.event, stream)
     end
     for event in otherdeps
-        wait(CUDADevice(), event, progress, stream)
+        wait(dev, event, progress, stream)
     end
 end
 
@@ -208,12 +209,12 @@ function __pin!(a)
     return nothing
 end
 
-function KernelAbstractions.async_copy!(::CUDADevice, A, B; dependencies=nothing, progress=yield)
+function KernelAbstractions.async_copy!(dev::CUDADevice, A, B; dependencies=nothing, progress=yield)
     A isa Array && __pin!(A)
     B isa Array && __pin!(B)
 
     stream = next_stream()
-    wait(CUDADevice(), MultiEvent(dependencies), progress, stream)
+    wait(dev, MultiEvent(dependencies), progress, stream)
     event = CUDA.CuEvent(CUDA.EVENT_DISABLE_TIMING)
     GC.@preserve A B begin
         destptr = pointer(A)
@@ -264,7 +265,7 @@ function threads_to_workgroupsize(threads, ndrange)
     end
 end
 
-function (obj::Kernel{CUDADevice})(args...; ndrange=nothing, dependencies=Event(CUDADevice()), workgroupsize=nothing, progress=yield)
+function (obj::Kernel{CUDADevice})(args...; ndrange=nothing, dependencies=Event(get_computing_device(first(args))), workgroupsize=nothing, progress=yield)
     ndrange, workgroupsize, iterspace, dynamic = launch_config(obj, ndrange, workgroupsize)
 
     # this might not be the final context, since we may tune the workgroupsize
@@ -294,7 +295,7 @@ function (obj::Kernel{CUDADevice})(args...; ndrange=nothing, dependencies=Event(
     end
 
     stream = next_stream()
-    wait(CUDADevice(), MultiEvent(dependencies), progress, stream)
+    wait(get_computing_device(first(args)), MultiEvent(dependencies), progress, stream)
 
     # Launch kernel
     event = CUDA.CuEvent(CUDA.EVENT_DISABLE_TIMING)
diff --git a/lib/ROCKernels/src/ROCKernels.jl b/lib/ROCKernels/src/ROCKernels.jl
index 9d2bc5f0..205e2a49 100644
--- a/lib/ROCKernels/src/ROCKernels.jl
+++ b/lib/ROCKernels/src/ROCKernels.jl
@@ -9,7 +9,7 @@ import KernelAbstractions
 
 export ROCDevice
 
-KernelAbstractions.get_device(::AMDGPU.ROCArray) = ROCDevice()
+get_computing_device(::AMDGPU.ROCArray) = ROCDevice()
 
 const FREE_QUEUES = HSAQueue[]
 
@@ -60,8 +60,6 @@ end
 
 import KernelAbstractions: Event, CPUEvent, NoneEvent, MultiEvent, CPU, GPU, isdone, failed
 
-struct ROCDevice <: GPU end
-
 struct ROCEvent{T<:Union{AMDGPU.HSA.Signal,HSAStatusSignal}} <: Event
     event::T
 end
diff --git a/src/KernelAbstractions.jl b/src/KernelAbstractions.jl
index 5d98efe8..dcfb22fd 100644
--- a/src/KernelAbstractions.jl
+++ b/src/KernelAbstractions.jl
@@ -333,28 +333,13 @@ constify(arg) = adapt(ConstAdaptor(), arg)
 # Backend hierarchy
 ###
 
-abstract type Device end
-abstract type GPU <: Device end
+const Device = AbstractComputingDevice
+const GPU = AbstractGPUDevice
+const CPU = CPUDevice
 
-struct CPU <: Device end
+Base.@deprecate get_device(A::AbstractArray) get_computing_device(A)
 
-"""
-    KernelAbstractions.get_device(A::AbstractArray)::KernelAbstractions.Device
-
-Get a `KernelAbstractions.Device` instance suitable for array `A`.
-"""
-function get_device end
-
-# Should cover SubArray, ReshapedArray, ReinterpretArray, Hermitian, AbstractTriangular, etc.:
-get_device(A::AbstractArray) = get_device(parent(A))
-
-get_device(A::AbstractSparseArray) = get_device(rowvals(A))
-get_device(A::Diagonal) = get_device(A.diag)
-get_device(A::Tridiagonal) = get_device(A.d)
-
-get_device(::Array) = CPU()
-
 include("nditeration.jl")
 using .NDIteration
 import .NDIteration: get
diff --git a/test/test.jl b/test/test.jl
index fac9fa57..86215518 100644
--- a/test/test.jl
+++ b/test/test.jl
@@ -66,20 +66,6 @@ end
     A[I] = i
 end
 
-@testset "get_device" begin
-    x = ArrayT(rand(Float32, 5))
-    A = ArrayT(rand(Float32, 5,5))
-    device = backend()
-    @test @inferred(KernelAbstractions.get_device(A)) == device
-    @test @inferred(KernelAbstractions.get_device(view(A, 2:4, 1:3))) == device
-    if !(isdefined(Main, :ROCKernels) && (device isa Main.ROCKernels.ROCDevice))
-        # Sparse arrays are not supported by the ROCm backend yet:
-        @test @inferred(KernelAbstractions.get_device(sparse(A))) == device
-    end
-    @test @inferred(KernelAbstractions.get_device(Diagonal(x))) == device
-    @test @inferred(KernelAbstractions.get_device(Tridiagonal(A))) == device
-end
-
 @testset "indextest" begin
     # TODO: add test for _group and _local_cartesian
     A = ArrayT{Int}(undef, 16, 16)
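
For reviewers who want to try the branch end to end, here is a minimal usage sketch. It simply combines the kernel from the KernelAbstractions docs with the launch pattern this patch writes into docs/src/index.md; it assumes an Adapt.jl version that provides `get_computing_device` and the `AbstractComputingDevice`/`AbstractGPUDevice` types this patch builds on.

```julia
# Sketch only: assumes the proposed Adapt.jl API (get_computing_device) is available.
using KernelAbstractions, Adapt
using CUDA, CUDAKernels   # GPU path; omit on CPU-only machines

# The same kernel definition serves every backend.
@kernel function mul2(A)
    I = @index(Global)
    A[I] = 2 * A[I]
end

# Pick an array type; no backend-specific branch is needed in user code.
A = CUDA.functional() ? CUDA.ones(1024, 1024) : ones(Float32, 1024, 1024)

# get_computing_device(A) replaces KernelAbstractions.get_device(A):
# it returns a CPU device for Array and a CUDA device for CuArray.
dev = get_computing_device(A)
kernel! = mul2(dev, 16)              # 16 = workgroup size
wait(kernel!(A, ndrange = size(A)))  # launch and wait on the returned event
```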