From 36be160cefe849dad375a0ef8087bd7f3a78bb7a Mon Sep 17 00:00:00 2001
From: Oliver Schulz
Date: Sun, 22 May 2022 13:44:41 +0200
Subject: [PATCH] Support Adapt.AbstractGPUDevice

---
 docs/src/index.md                  |  2 +-
 examples/matmul.jl                 |  4 ++--
 examples/memcopy.jl                |  2 +-
 examples/memcopy_static.jl         |  2 +-
 examples/mpi.jl                    |  2 +-
 examples/naive_transpose.jl        |  4 ++--
 examples/performance.jl            | 12 ++++++------
 lib/CUDAKernels/src/CUDAKernels.jl | 25 +++++++++++++------------
 lib/ROCKernels/src/ROCKernels.jl   |  4 +---
 src/KernelAbstractions.jl          | 23 ++++-------------------
 test/test.jl                       | 14 --------------
 11 files changed, 32 insertions(+), 62 deletions(-)

diff --git a/docs/src/index.md b/docs/src/index.md
index 4982516a..ec6a2ba0 100644
--- a/docs/src/index.md
+++ b/docs/src/index.md
@@ -56,7 +56,7 @@ kernel on it instead. For example, launching on a CUDA GPU:
 ```julia
 using CUDAKernels # Required to access CUDADevice
 A = CUDA.ones(1024, 1024)
-kernel = mul2(CUDADevice(), 16)
+kernel = mul2(get_computing_device(A), 16)
 # ... the rest is the same!
 ```
 
diff --git a/examples/matmul.jl b/examples/matmul.jl
index 054f8c15..d9939820 100644
--- a/examples/matmul.jl
+++ b/examples/matmul.jl
@@ -1,4 +1,4 @@
-using KernelAbstractions, Test
+using KernelAbstractions, Adapt, Test
 include(joinpath(@__DIR__, "utils.jl")) # Load backend
 
 if has_cuda && has_cuda_gpu()
@@ -24,7 +24,7 @@ function matmul!(a, b, c)
         println("Matrix size mismatch!")
         return nothing
     end
-    device = KernelAbstractions.get_device(a)
+    device = get_computing_device(a)
     n = device isa GPU ? 256 : 4
     kernel! = matmul_kernel!(device, n)
     kernel!(a, b, c, ndrange=size(c))
diff --git a/examples/memcopy.jl b/examples/memcopy.jl
index b51e83d4..cbe6fe80 100644
--- a/examples/memcopy.jl
+++ b/examples/memcopy.jl
@@ -22,7 +22,7 @@ wait(event)
 if has_cuda && has_cuda_gpu()
     function mycopy!(A::CuArray, B::CuArray)
         @assert size(A) == size(B)
-        copy_kernel!(CUDADevice(), 256)(A, B, ndrange=length(A))
+        copy_kernel!(get_computing_device(A), 256)(A, B, ndrange=length(A))
     end
 
     A = CuArray{Float32}(undef, 1024)
diff --git a/examples/memcopy_static.jl b/examples/memcopy_static.jl
index cac10886..5a3b5854 100644
--- a/examples/memcopy_static.jl
+++ b/examples/memcopy_static.jl
@@ -22,7 +22,7 @@ if has_cuda && has_cuda_gpu()
 
     function mycopy_static!(A::CuArray, B::CuArray)
         @assert size(A) == size(B)
-        kernel = copy_kernel!(CUDADevice(), 32, size(A)) # if size(A) varies this will cause recompilation
+        kernel = copy_kernel!(get_computing_device(A), 32, size(A)) # if size(A) varies this will cause recompilation
         kernel(A, B, ndrange=size(A))
     end
 
diff --git a/examples/mpi.jl b/examples/mpi.jl
index 369666df..3121bb8c 100644
--- a/examples/mpi.jl
+++ b/examples/mpi.jl
@@ -11,7 +11,7 @@ end
 
 using MPI
 
-device(A) = typeof(A) <: Array ? CPU() : CUDADevice()
+device(A) = typeof(A) <: Array ? CPU() : CUDA.device()
 
 function mpiyield()
     MPI.Iprobe(MPI.MPI_ANY_SOURCE, MPI.MPI_ANY_TAG, MPI.COMM_WORLD)
diff --git a/examples/naive_transpose.jl b/examples/naive_transpose.jl
index 77877647..38e75778 100644
--- a/examples/naive_transpose.jl
+++ b/examples/naive_transpose.jl
@@ -1,4 +1,4 @@
-using KernelAbstractions, Test
+using KernelAbstractions, Adapt, Test
 include(joinpath(@__DIR__, "utils.jl")) # Load backend
 
 if has_cuda && has_cuda_gpu()
@@ -17,7 +17,7 @@ function naive_transpose!(a, b)
         println("Matrix size mismatch!")
         return nothing
     end
-    device = KernelAbstractions.get_device(a)
+    device = get_computing_device(a)
     n = device isa GPU ? 256 : 4
     kernel! = naive_transpose_kernel!(device, n)
     kernel!(a, b, ndrange=size(a))
diff --git a/examples/performance.jl b/examples/performance.jl
index c223fd7f..a4f184af 100644
--- a/examples/performance.jl
+++ b/examples/performance.jl
@@ -133,8 +133,8 @@ end
 
 for block_dims in ((TILE_DIM, TILE_DIM), (TILE_DIM*TILE_DIM, 1), (1, TILE_DIM*TILE_DIM))
     for (name, kernel) in (
-        ("copy", simple_copy_kernel!(CUDADevice(), block_dims)),
-        ("transpose", simple_transpose_kernel!(CUDADevice(), block_dims)),
+        ("copy", simple_copy_kernel!(CUDA.device(), block_dims)),
+        ("transpose", simple_transpose_kernel!(CUDA.device(), block_dims)),
     )
         NVTX.@range "Simple $name $block_dims" let
             input = CUDA.rand(T, (N, N))
@@ -154,8 +154,8 @@ end
 
 # Benchmark localmem
 for (name, kernel) in (
-    ("copy", lmem_copy_kernel!(CUDADevice(), (TILE_DIM, TILE_DIM))),
-    ("transpose", lmem_transpose_kernel!(CUDADevice(), (TILE_DIM, TILE_DIM))),
+    ("copy", lmem_copy_kernel!(CUDA.device(), (TILE_DIM, TILE_DIM))),
+    ("transpose", lmem_transpose_kernel!(CUDA.device(), (TILE_DIM, TILE_DIM))),
 )
     for bank in (true, false)
         NVTX.@range "Localmem $name ($TILE_DIM, $TILE_DIM) bank=$bank" let
@@ -176,8 +176,8 @@ end
 
 # Benchmark localmem + multiple elements per lane
 for (name, kernel) in (
-    ("copy", coalesced_copy_kernel!(CUDADevice(), (TILE_DIM, BLOCK_ROWS))),
-    ("transpose", coalesced_transpose_kernel!(CUDADevice(), (TILE_DIM, BLOCK_ROWS))),
+    ("copy", coalesced_copy_kernel!(CUDA.device(), (TILE_DIM, BLOCK_ROWS))),
+    ("transpose", coalesced_transpose_kernel!(CUDA.device(), (TILE_DIM, BLOCK_ROWS))),
 )
     for bank in (true, false)
         NVTX.@range "Localmem + multiple elements $name ($TILE_DIM, $BLOCK_ROWS) bank=$bank" let
diff --git a/lib/CUDAKernels/src/CUDAKernels.jl b/lib/CUDAKernels/src/CUDAKernels.jl
index 43337d73..c5c4112d 100644
--- a/lib/CUDAKernels/src/CUDAKernels.jl
+++ b/lib/CUDAKernels/src/CUDAKernels.jl
@@ -6,10 +6,9 @@ import StaticArrays: MArray
 import Adapt
 import KernelAbstractions
 
-export CUDADevice
+using Adapt: get_computing_device
 
-KernelAbstractions.get_device(::CUDA.CuArray) = CUDADevice()
-KernelAbstractions.get_device(::CUDA.CUSPARSE.AbstractCuSparseArray) = CUDADevice()
+export CUDADevice
 
 const FREE_STREAMS = CUDA.CuStream[]
 const STREAMS = CUDA.CuStream[]
@@ -94,7 +93,7 @@ end
 
 import KernelAbstractions: Event, CPUEvent, NoneEvent, MultiEvent, CPU, GPU, isdone, failed
 
-struct CUDADevice <: GPU end
+const CUDADevice = CUDA.CuDevice
 
 struct CudaEvent <: Event
     event::CUDA.CuEvent
@@ -103,6 +102,8 @@ end
 failed(::CudaEvent) = false
 isdone(ev::CudaEvent) = CUDA.query(ev.event)
 
+Adapt.get_computing_device(ev::CudaEvent) = get_computing_device(ev.event)
+
 function Event(::CUDADevice)
     stream = CUDA.stream()
     event = CUDA.CuEvent(CUDA.EVENT_DISABLE_TIMING)
@@ -134,11 +135,11 @@ function wait(::CPU, ev::CudaEvent, progress=nothing)
     event = Base.Event()
 
     stream = next_stream()
-    wait(CUDADevice(), ev, nothing, stream)
+    dev = get_computing_device(ev)
+    wait(dev, ev, nothing, stream)
     CUDA.launch(;stream) do
         notify(event)
     end
-    dev = CUDA.device()
     # if an error occurs, the callback may never fire, so use a timer to detect such cases
     timer = Timer(0; interval=1)
     Base.@sync begin
@@ -169,7 +170,7 @@ end
 wait(::CUDADevice, ev::CudaEvent, progress=nothing, stream=CUDA.stream()) = CUDA.wait(ev.event, stream)
 wait(::CUDADevice, ev::NoneEvent, progress=nothing, stream=nothing) = nothing
 
-function wait(::CUDADevice, ev::MultiEvent, progress=nothing, stream=CUDA.stream())
+function wait(dev::CUDADevice, ev::MultiEvent, progress=nothing, stream=CUDA.stream())
     dependencies = collect(ev.events)
     cudadeps = filter(d->d isa CudaEvent, dependencies)
     otherdeps = filter(d->!(d isa CudaEvent), dependencies)
@@ -177,7 +178,7 @@ function wait(::CUDADevice, ev::MultiEvent, progress=nothing, stream=CUDA.stream
         CUDA.wait(event.event, stream)
     end
     for event in otherdeps
-        wait(CUDADevice(), event, progress, stream)
+        wait(dev, event, progress, stream)
     end
 end
 
@@ -208,12 +209,12 @@ function __pin!(a)
     return nothing
 end
 
-function KernelAbstractions.async_copy!(::CUDADevice, A, B; dependencies=nothing, progress=yield)
+function KernelAbstractions.async_copy!(dev::CUDADevice, A, B; dependencies=nothing, progress=yield)
     A isa Array && __pin!(A)
     B isa Array && __pin!(B)
 
     stream = next_stream()
-    wait(CUDADevice(), MultiEvent(dependencies), progress, stream)
+    wait(dev, MultiEvent(dependencies), progress, stream)
     event = CUDA.CuEvent(CUDA.EVENT_DISABLE_TIMING)
     GC.@preserve A B begin
         destptr = pointer(A)
@@ -264,7 +265,7 @@ function threads_to_workgroupsize(threads, ndrange)
     end
 end
 
-function (obj::Kernel{CUDADevice})(args...; ndrange=nothing, dependencies=Event(CUDADevice()), workgroupsize=nothing, progress=yield)
+function (obj::Kernel{CUDADevice})(args...; ndrange=nothing, dependencies=Event(get_computing_device(first(args))), workgroupsize=nothing, progress=yield)
     ndrange, workgroupsize, iterspace, dynamic = launch_config(obj, ndrange, workgroupsize)
 
     # this might not be the final context, since we may tune the workgroupsize
@@ -294,7 +295,7 @@ function (obj::Kernel{CUDADevice})(args...; ndrange=nothing, dependencies=Event(
     end
 
     stream = next_stream()
-    wait(CUDADevice(), MultiEvent(dependencies), progress, stream)
+    wait(get_computing_device(first(args)), MultiEvent(dependencies), progress, stream)
 
     # Launch kernel
     event = CUDA.CuEvent(CUDA.EVENT_DISABLE_TIMING)
diff --git a/lib/ROCKernels/src/ROCKernels.jl b/lib/ROCKernels/src/ROCKernels.jl
index 9d2bc5f0..205e2a49 100644
--- a/lib/ROCKernels/src/ROCKernels.jl
+++ b/lib/ROCKernels/src/ROCKernels.jl
@@ -9,7 +9,7 @@ import KernelAbstractions
 
 export ROCDevice
 
-KernelAbstractions.get_device(::AMDGPU.ROCArray) = ROCDevice()
+get_computing_device(::AMDGPU.ROCArray) = ROCDevice()
 
 const FREE_QUEUES = HSAQueue[]
 
@@ -60,8 +60,6 @@ end
 
 import KernelAbstractions: Event, CPUEvent, NoneEvent, MultiEvent, CPU, GPU, isdone, failed
 
-struct ROCDevice <: GPU end
-
 struct ROCEvent{T<:Union{AMDGPU.HSA.Signal,HSAStatusSignal}} <: Event
     event::T
 end
diff --git a/src/KernelAbstractions.jl b/src/KernelAbstractions.jl
index 5d98efe8..dcfb22fd 100644
--- a/src/KernelAbstractions.jl
+++ b/src/KernelAbstractions.jl
@@ -333,28 +333,13 @@ constify(arg) = adapt(ConstAdaptor(), arg)
 # Backend hierarchy
 ###
 
-abstract type Device end
-abstract type GPU <: Device end
+const Device = AbstractComputingDevice
+const GPU = AbstractGPUDevice
+const CPU = CPUDevice
 
-struct CPU <: Device end
+Base.@deprecate get_device(A::AbstractArray) get_computing_device(A)
 
-"""
-    KernelAbstractions.get_device(A::AbstractArray)::KernelAbstractions.Device
-
-Get a `KernelAbstractions.Device` instance suitable for array `A`.
-"""
-function get_device end
-
-# Should cover SubArray, ReshapedArray, ReinterpretArray, Hermitian, AbstractTriangular, etc.:
-get_device(A::AbstractArray) = get_device(parent(A))
-
-get_device(A::AbstractSparseArray) = get_device(rowvals(A))
-get_device(A::Diagonal) = get_device(A.diag)
-get_device(A::Tridiagonal) = get_device(A.d)
-
-get_device(::Array) = CPU()
-
 include("nditeration.jl")
 using .NDIteration
 import .NDIteration: get
diff --git a/test/test.jl b/test/test.jl
index fac9fa57..86215518 100644
--- a/test/test.jl
+++ b/test/test.jl
@@ -66,20 +66,6 @@ end
     A[I] = i
 end
 
-@testset "get_device" begin
-    x = ArrayT(rand(Float32, 5))
-    A = ArrayT(rand(Float32, 5,5))
-    device = backend()
-    @test @inferred(KernelAbstractions.get_device(A)) == device
-    @test @inferred(KernelAbstractions.get_device(view(A, 2:4, 1:3))) == device
-    if !(isdefined(Main, :ROCKernels) && (device isa Main.ROCKernels.ROCDevice))
-        # Sparse arrays are not supported by the ROCm backend yet:
-        @test @inferred(KernelAbstractions.get_device(sparse(A))) == device
-    end
-    @test @inferred(KernelAbstractions.get_device(Diagonal(x))) == device
-    @test @inferred(KernelAbstractions.get_device(Tridiagonal(A))) == device
-end
-
 @testset "indextest" begin
     # TODO: add test for _group and _local_cartesian
     A = ArrayT{Int}(undef, 16, 16)
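
For reviewers who want to try the branch end to end, here is a minimal usage sketch. It simply combines the kernel from the KernelAbstractions docs with the launch pattern this patch writes into docs/src/index.md; it assumes an Adapt.jl version that provides `get_computing_device` and the `AbstractComputingDevice`/`AbstractGPUDevice` types this patch builds on.

```julia
# Sketch only: assumes the proposed Adapt.jl API (get_computing_device) is available.
using KernelAbstractions, Adapt
using CUDA, CUDAKernels   # GPU path; omit on CPU-only machines

# The same kernel definition serves every backend.
@kernel function mul2(A)
    I = @index(Global)
    A[I] = 2 * A[I]
end

# Pick an array type; no backend-specific branch is needed in user code.
A = CUDA.functional() ? CUDA.ones(1024, 1024) : ones(Float32, 1024, 1024)

# get_computing_device(A) replaces KernelAbstractions.get_device(A):
# it returns a CPU device for Array and a CUDA device for CuArray.
dev = get_computing_device(A)
kernel! = mul2(dev, 16)              # 16 = workgroup size
wait(kernel!(A, ndrange = size(A)))  # launch and wait on the returned event
```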