Update docs #593

Merged
merged 4 commits into from Feb 20, 2024
22 changes: 1 addition & 21 deletions .buildkite/pipeline.yml
@@ -22,27 +22,7 @@ steps:
if: build.message !~ /\[skip docs\]/
timeout_in_minutes: 10

# - label: "Julia 1.9 - No Artifacts"
# plugins:
# - JuliaCI/julia#v1:
# version: 1.9
# - JuliaCI/julia-test#v1:
# - JuliaCI/julia-coverage#v1:
# codecov: true
# agents:
# queue: "juliagpu"
# rocm: "*"
# rocmgpu: "gfx1100"
# if: build.message !~ /\[skip tests\]/
# command: "julia --project -e 'using Pkg; Pkg.update()'"
# timeout_in_minutes: 180
# env:
# JULIA_NUM_THREADS: 4
# JULIA_AMDGPU_CORE_MUST_LOAD: "1"
# JULIA_AMDGPU_HIP_MUST_LOAD: "1"
# JULIA_AMDGPU_DISABLE_ARTIFACTS: "1"

- label: "Julia 1.10 - No Artifacts"
- label: "Julia 1.10"
plugins:
- JuliaCI/julia#v1:
version: "1.10"
2 changes: 2 additions & 0 deletions Project.toml
Expand Up @@ -20,6 +20,7 @@ Libdl = "8f399da3-3557-5675-b5ff-fb832c97cbdb"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
Preferences = "21216c6a-2e73-6563-6e65-726566657250"
PrettyTables = "08abe8d2-0d0c-5749-adfa-8a2ac140af0d"
Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
ROCmDeviceLibs_jll = "873c0968-716b-5aa7-bb8d-d1e2e2aeff2d"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
@@ -45,6 +46,7 @@ LLD_jll = "14, 15, 16"
LLVM = "6"
LLVM_jll = "14, 15, 16"
Preferences = "1"
PrettyTables = "2.3"
ROCmDeviceLibs_jll = "5.6.1"
Random123 = "1.6"
RandomNumbers = "1.5"
139 changes: 70 additions & 69 deletions docs/src/quickstart.md
@@ -1,55 +1,90 @@
```@meta
DocTestSetup = quote
using AMDGPU
end
```

# Quick Start

## Running a simple kernel
## Simple example

As a simple example, let's add two vectors both on CPU and GPU
and make sure that the results are the same:

First, we do this on CPU:

As a simple test, we will try to add two random vectors
and make sure that the results from the CPU and the GPU are indeed the same.
```jldoctest simple-example
julia> n = 1024;

We can start by first performing this simple calculation on the CPU:
julia> a = fill(1.0, n);

```julia
N = 1024
a = rand(Float64, N)
b = rand(Float64, N)
c_cpu = a + b
julia> b = fill(2.0, n);

julia> c = a .+ b;
```

To do the same computation on the GPU, we first need to copy
the two input arrays `a` and `b` to the device.
Toward that end, we will use the `ROCArray` type to represent our GPU arrays.
We can create the two arrays by passing the host data
to the constructor as follows:

```julia
using AMDGPU
a_d = ROCArray(a)
b_d = ROCArray(b)
the host arrays to the device and then simply add them together element-wise:

```jldoctest simple-example
julia> a_d = ROCArray(a);

julia> b_d = ROCArray(b);

julia> c_d = a_d .+ b_d;
```

We need to create one additional array `c_d` to store the results:
Now, let's check that the results are the same on the CPU and GPU by
transferring the GPU array back to the host and comparing the results:

```julia
c_d = similar(a_d)
```jldoctest simple-example
julia> Array(c_d) ≈ c
true
```

In this example, the postfix `_d` distinguishes a device memory object
from its host memory counterpart.
This convention is completely arbitrary and you may name your
device-side variables whatever you like; they are regular Julia variables.
## Kernel example

Next, we will define the GPU kernel that does the actual computation:
Alternatively, we can perform the same computation by writing a custom
GPU kernel:

```julia
function vadd!(c, a, b)
i = workitemIdx().x + (workgroupIdx().x - 1) * workgroupDim().x
c[i] = a[i] + b[i]
return
end
```jldoctest simple-example
julia> function vadd!(c, a, b)
i = workitemIdx().x + (workgroupIdx().x - 1) * workgroupDim().x
c[i] = a[i] + b[i]
return
end
vadd! (generic function with 1 method)
```

The index `i` of a single workitem can be uniquely identified by its grid index.
In this case only one dimension is used, so we take only the `.x` coordinate
into account.

A kernel is compiled upon its first launch.
Subsequent launches re-use it, without recompilation.
Let's launch the kernel, but first clear out the result vector `c_d`.

```jldoctest simple-example
julia> fill!(c_d, 0.0);

julia> groupsize = 256;

julia> gridsize = cld(n, groupsize);

julia> @roc groupsize=groupsize gridsize=gridsize vadd!(c_d, a_d, b_d);

julia> Array(c_d) ≈ c
true
```

The index of a single workitem can be uniquely identified by its grid index
(computed linearly as `(workgroupDim().x * (workgroupIdx().x - 1)) + workitemIdx().x`
when only a single dimension is used).
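The linear-index formula above can be checked with plain host-side integers (a sketch with illustrative values; nothing here touches the GPU):

```julia
# Illustrative values: 256 workitems per workgroup,
# second workgroup, third workitem (all 1-based, as in Julia).
workgroup_dim = 256
workgroup_idx = 2
workitem_idx  = 3

# Same formula as in the kernel:
# (workgroupDim().x * (workgroupIdx().x - 1)) + workitemIdx().x
global_index = workgroup_dim * (workgroup_idx - 1) + workitem_idx
# 256 * 1 + 3 = 259
```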
The easiest way to launch a GPU kernel is with the `@roc` macro,
specifying `groupsize` and `gridsize` to cover the full array,
and calling it like a regular function.

Keep in mind that kernel launches are asynchronous,
meaning that you need to synchronize before you can use the result
(e.g. with [`AMDGPU.synchronize`](@ref)).
However, GPU <-> CPU transfers synchronize implicitly.
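The launch/synchronize pattern described above can be sketched end-to-end (a minimal sketch that assumes a working AMD GPU; `vadd!` is the kernel defined earlier on this page):

```julia
using AMDGPU

function vadd!(c, a, b)
    i = workitemIdx().x + (workgroupIdx().x - 1) * workgroupDim().x
    c[i] = a[i] + b[i]
    return
end

a_d = ROCArray(fill(1.0, 1024))
b_d = ROCArray(fill(2.0, 1024))
c_d = similar(a_d)

# The launch returns immediately; the GPU may still be running.
@roc groupsize=256 gridsize=4 vadd!(c_d, a_d, b_d)

# Explicit barrier before using the result from the same task:
AMDGPU.synchronize()

# Copying back to the host would also synchronize implicitly:
c = Array(c_d)
```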

The grid is the domain over which the *entire* kernel executes.
The grid will be split into multiple workgroups by the hardware automatically,
@@ -74,35 +109,6 @@
If this statement is omitted, Julia will attempt to return the value
of the last evaluated expression, in this case a `Float64`,
which will cause a compilation failure as kernels cannot return values.
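To make the failure mode above concrete, here is a sketch of the broken and the fixed pattern (hypothetical kernel names; launching `scale_bad!` on a real device would fail at kernel compile time):

```julia
using AMDGPU

# Broken: the last evaluated expression is the assignment `x[i] = x[i] * 2.0`,
# whose value is a Float64, so the kernel implicitly tries to return it
# and compilation fails.
function scale_bad!(x)
    i = workitemIdx().x
    x[i] = x[i] * 2.0
end

# Fixed: an explicit bare `return` makes the kernel return `nothing`.
function scale!(x)
    i = workitemIdx().x
    x[i] = x[i] * 2.0
    return
end

x = ROCArray(ones(Float64, 32))
@roc groupsize=32 scale!(x)
AMDGPU.synchronize()
```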

The easiest way to launch a GPU kernel is with the `@roc` macro,
specifying `groupsize` and `gridsize` to cover the full array,
and calling it like a regular function:

```julia
groupsize = 128
gridsize = cld(length(c_d), groupsize)
@roc gridsize=gridsize groupsize=groupsize vadd!(c_d, a_d, b_d)
```

Keep in mind that kernel launches are asynchronous,
meaning that you need to do some kind of synchronization before you use the result.
For instance, you can call `AMDGPU.synchronize()`:

```julia
@roc groupsize=N vadd!(c_d, a_d, b_d)
AMDGPU.synchronize()
```

Finally, we can make sure that the results match,
by first copying the data to the host and then comparing it with the CPU results:

```julia
c = Array(c_d)

using Test
@test isapprox(c, c_cpu)
```

## Naming conventions

Throughout this example we use terms like "work group" and "work item".
@@ -124,8 +130,3 @@
As a quick summary, here is a mapping of the most common terms:
| `groupsize` | `threads` |
| `gridsize` | `blocks` |
| `stream` | `stream` |
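To make the mapping above concrete, here is a launch annotated with both vocabularies (the AMDGPU.jl calls are as used on this page; the CUDA.jl names in the comments are for comparison only, and the kernel is illustrative):

```julia
using AMDGPU

function fill_index!(x)
    # AMDGPU.jl: workitemIdx / workgroupIdx / workgroupDim
    # CUDA.jl:   threadIdx   / blockIdx     / blockDim
    i = workitemIdx().x + (workgroupIdx().x - 1) * workgroupDim().x
    x[i] = i
    return
end

x = ROCArray(zeros(Int32, 1024))
# groupsize (CUDA: threads) = 256, gridsize (CUDA: blocks) = 4
@roc groupsize=256 gridsize=4 fill_index!(x)
AMDGPU.synchronize()
```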

!!! warning
Since AMDGPU v0.5.0 `gridsize` represents the number of "workgroups"
(or `blocks` in CUDA) and no longer "workitems * workgroups"
(or `threads * blocks` in CUDA).
3 changes: 2 additions & 1 deletion src/AMDGPU.jl
@@ -10,8 +10,9 @@ using Preferences
using Printf

import Atomix: @atomic, @atomicswap, @atomicreplace
import LinearAlgebra
import Core: LLVMPtr
import LinearAlgebra
import PrettyTables

const Maybe{T} = Union{Nothing, T}

1 change: 1 addition & 0 deletions src/hip/HIP.jl
@@ -2,6 +2,7 @@ module HIP

using CEnum
import Preferences
import PrettyTables

import ..AMDGPU
import ..AMDGPU.libhip
30 changes: 26 additions & 4 deletions src/hip/device.jl
@@ -96,10 +96,6 @@ function attribute(dev::HIPDevice, attr)
v[]
end

function Base.show(io::IO, dev::HIPDevice)
print(io, "HIPDevice(name=\"$(name(dev))\", id=$(dev.device_id), gcn_arch=$(dev.gcn_arch))")
end

function ndevices()
count_ref = Ref{Cint}()
hipGetDeviceCount(count_ref) |> check
@@ -149,3 +145,29 @@ function can_access_peer(dev::HIPDevice, peer::HIPDevice)
hipDeviceCanAccessPeer(result, device_id(dev), device_id(peer)) |> check
return result[] == 1
end

# Pretty-printing.

function Base.show(io::IO, dev::HIPDevice)
print(io, "HIPDevice(id=$(dev.device_id), name=$(name(dev)), gcn_arch=$(dev.gcn_arch))")
end

function __pretty_data(dev::HIPDevice)
props = properties(dev)
name_ptr = pointer([props.name...])
name = unsafe_string(name_ptr)
reshape(String[
"$(dev.device_id)", name, "$(dev.gcn_arch)",
"$(dev.wavefrontsize)", "$(Base.format_bytes(props.totalGlobalMem))",
], 1, :)
end

function Base.show(io::IO, mime::MIME{Symbol("text/plain")}, dev::HIPDevice)
PrettyTables.pretty_table(io, __pretty_data(dev); header=[
"Id", "Name", "GCN arch", "Wavefront", "Memory"])
end

function Base.show(io::IO, mime::MIME{Symbol("text/plain")}, devs::Vector{HIPDevice})
PrettyTables.pretty_table(io, vcat(__pretty_data.(devs)...); header=[
"Id", "Name", "GCN arch", "Wavefront", "Memory"])
end
9 changes: 9 additions & 0 deletions src/hip/stream.jl
@@ -153,6 +153,15 @@ function Base.show(io::IO, stream::HIPStream)
print(io, "HIPStream(device=$(stream.device), ptr=$(repr(UInt64(stream.stream))), priority=$(stream.priority))")
end

function Base.show(io::IO, mime::MIME{Symbol("text/plain")}, stream::HIPStream)
data = reshape([
"$(repr(UInt64(stream.stream)))",
"$(stream.priority)",
"$(stream.device)",
], 1, :)
PrettyTables.pretty_table(io, data; header=["Ptr", "Priority", "Device"])
end

function priority_to_symbol(priority)
priority == 0 && return :normal
priority == -1 && return :high
73 changes: 21 additions & 52 deletions src/utils.jl
@@ -1,60 +1,29 @@
function versioninfo(io::IO=stdout)
function versioninfo()
_status(st::Bool) = st ? "+" : "-"
function _lib_title(name::String, sym::Symbol; version_fn::Function)
st = _status(functional(sym))
ver = (functional(sym) && version_fn ≢ identity) ? "v$(version_fn())" : ""
"[$st] $name $ver"
end

println("ROCm provided by: ", use_artifacts() ? "JLLs" : "system")
println("[$(_status(functional(:lld)))] ld.lld")
if functional(:lld)
println(" @ $lld_path")
end
println("[$(_status(functional(:device_libs)))] ROCm-Device-Libs")
if functional(:device_libs)
println(" @ $libdevice_libs")
end
println(_lib_title("HIP Runtime", :hip; version_fn=HIP.runtime_version))
if functional(:hip)
println(" @ $libhip")
end
println(_lib_title("rocBLAS", :rocblas; version_fn=rocBLAS.version))
if functional(:rocblas)
println(" @ $librocblas")
end
println(_lib_title("rocSOLVER", :rocsolver; version_fn=rocSOLVER.version))
if functional(:rocsolver)
println(" @ $librocsolver")
end
println("[$(_status(functional(:rocalution)))] rocALUTION")
if functional(:rocalution)
println(" @ $librocalution")
end
println("[$(_status(functional(:rocsparse)))] rocSPARSE")
if functional(:rocsparse)
println(" @ $librocsparse")
end
println(_lib_title("rocRAND", :rocrand; version_fn=rocRAND.version))
if functional(:rocrand)
println(" @ $librocrand")
end
println(_lib_title("rocFFT", :rocfft; version_fn=rocFFT.version))
if functional(:rocfft)
println(" @ $librocfft")
end
println(_lib_title("MIOpen", :MIOpen; version_fn=MIOpen.version))
if functional(:MIOpen)
println(" @ $libMIOpen_path")
end
_libpath(p::String) = isempty(p) ? "-" : p
_ver(lib::Symbol, ver_fn) = functional(lib) ? "$(ver_fn())" : "-"
data = String[
_status(functional(:lld)) "LLD" "-" _libpath(lld_path);
_status(functional(:device_libs)) "Device Libraries" "-" _libpath(libdevice_libs);
_status(functional(:hip)) "HIP" _ver(:hip, HIP.runtime_version) _libpath(libhip);
_status(functional(:rocblas)) "rocBLAS" _ver(:rocblas, rocBLAS.version) _libpath(librocblas);
_status(functional(:rocsolver)) "rocSOLVER" _ver(:rocsolver, rocSOLVER.version) _libpath(librocsolver);
_status(functional(:rocalution)) "rocALUTION" "-" _libpath(librocalution);
_status(functional(:rocsparse)) "rocSPARSE" "-" _libpath(librocsparse);
_status(functional(:rocrand)) "rocRAND" _ver(:rocrand, rocRAND.version) _libpath(librocrand);
_status(functional(:rocfft)) "rocFFT" _ver(:rocfft, rocFFT.version) _libpath(librocfft);
_status(functional(:MIOpen)) "MIOpen" _ver(:MIOpen, MIOpen.version) _libpath(libMIOpen_path);
]

PrettyTables.pretty_table(data; header=[
"Available", "Name", "Version", "Path"],
alignment=[:c, :l, :l, :l])

if functional(:hip)
println()
println("HIP Devices [$(length(HIP.devices()))]")
for (i, device) in enumerate(HIP.devices())
println(" $i. ", repr(device))
end
display(AMDGPU.devices())
end
return
end

"""