@synchronize inside while loops doesn't work on CPUs #262
Open
tom91136 opened this issue Jul 20, 2021 · 0 comments

I'm trying to port the classic GPU tree reduction via KernelAbstractions.jl.
See this for the direct CUDA implementation of what I'm trying to port from (a simplified sketch of it is also included below, after my KernelAbstractions version).
This is what I have implemented currently:

const TBSize = 1024::Int
const DotBlocks = 256::Int

@kernel function dot(@Const(a), @Const(b), size, partial)
  # T is the element type parameter of the enclosing function (see the full reproducer below)
  local_i = @index(Local)
  group_i = @index(Group)
  tb_sum = @localmem T TBSize
  @inbounds tb_sum[local_i] = 0.0

  # do dot first
  i = @index(Global)
  while i <= size
    @inbounds tb_sum[local_i] += a[i] * b[i]
    i += TBSize * DotBlocks
  end

  # then tree reduction
  offset = @private Int64 (1,)
  @inbounds begin
    offset[1] = @groupsize()[1] ÷ 2
    while offset[1] > 0
      @synchronize
      if (local_i - 1) < offset[1]
        tb_sum[local_i] += tb_sum[local_i+offset[1]]
      end
      offset[1] ÷= 2
    end
  end

  if (local_i == 1)
    @inbounds partial[group_i] = tb_sum[local_i]
  end
end

# driver
wait(dot(backendDevice, TBSize)(a, b, size, partial_sum, ndrange = TBSize * DotBlocks))
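
For context, the CUDA.jl kernel this is ported from looks roughly like the sketch below (simplified, not verbatim from the linked CUDAStream.jl; Float32 and the 1024/256 sizing are hard-coded here, and the launch assumes a, b, size, partial_sum are set up as in the driver above):

using CUDA

# Simplified sketch of the CUDA.jl dot kernel being ported: grid-stride partial
# sums into shared memory, then a tree reduction with sync_threads() inside a
# while loop, which works fine on the GPU.
function dot_kernel(a, b, size, partial)
  tb_sum = @cuStaticSharedMem(Float32, 1024)  # 1024 == TBSize
  local_i = threadIdx().x
  @inbounds tb_sum[local_i] = 0.0f0

  # grid-stride loop over the input
  i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
  while i <= size
    @inbounds tb_sum[local_i] += a[i] * b[i]
    i += blockDim().x * gridDim().x
  end

  # tree reduction within the block
  offset = blockDim().x ÷ 2
  while offset > 0
    sync_threads()
    if local_i - 1 < offset
      @inbounds tb_sum[local_i] += tb_sum[local_i+offset]
    end
    offset ÷= 2
  end

  if local_i == 1
    @inbounds partial[blockIdx().x] = tb_sum[local_i]
  end
  return
end

# launch: a, b are CuArrays, partial_sum is a CuArray of length DotBlocks
@cuda blocks = DotBlocks threads = TBSize dot_kernel(a, b, size, partial_sum)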

On the GPU backends I get correct results, and performance is mostly on par with our [CUDA.jl](https://github.com/UoB-HPC/BabelStream/blob/7c1e04a42b9b03b0e5c5d0b07c0ef9f4bdd59353/JuliaStream.jl/src/CUDAStream.jl#L112) and [AMDGPU.jl](https://github.com/UoB-HPC/BabelStream/blob/7c1e04a42b9b03b0e5c5d0b07c0ef9f4bdd59353/JuliaStream.jl/src/AMDGPUStream.jl#L135) implementations.
On the CPU, however, I get the following error:

Using device: AMD Ryzen 9 3900X 12-Core Processor (1T)
ERROR: LoadError: TaskFailedException
Stacktrace:
 [1] wait
   @ ./task.jl:322 [inlined]
 [2] wait
   @ ~/.julia/packages/KernelAbstractions/8W8KX/src/cpu.jl:65 [inlined]
 [3] wait (repeats 2 times)
   @ ~/.julia/packages/KernelAbstractions/8W8KX/src/cpu.jl:29 [inlined]
 [4] runDot(device::Tuple{UndefInitializer, String, Backend}, size::Int64, ainit::Float32, binit::Float32)
   @ Main ~/babelstream-upstream/JuliaStream.jl/src/Test.jl:102
 [5] top-level scope
   @ ~/babelstream-upstream/JuliaStream.jl/src/Test.jl:107

    nested task error: MethodError: no method matching isless(::Int64, ::SubArray{Int64, 1, StaticArrays.MMatrix{1, 1024, Int64, 1024}, Tuple{Base.Slice{StaticArrays.SOneTo{1}}, Int64}, true})
    Closest candidates are:
      isless(::AbstractVector{T} where T, ::AbstractVector{T} where T) at abstractarray.jl:1989
      isless(::Any, ::Missing) at missing.jl:88
      isless(::Missing, ::Any) at missing.jl:87
      ...
    Stacktrace:
      [1] call
        @ ~/.julia/packages/Cassette/N5kbV/src/context.jl:456 [inlined]
      [2] fallback
        @ ~/.julia/packages/Cassette/N5kbV/src/context.jl:454 [inlined]
      [3] _overdub_fallback(::Any, ::Vararg{Any, N} where N)
        @ ~/.julia/packages/Cassette/N5kbV/src/overdub.jl:582 [inlined]
      [4] overdub
        @ ~/.julia/packages/Cassette/N5kbV/src/overdub.jl:582 [inlined]
      [5] <(::Int64, ::SubArray{Int64, 1, StaticArrays.MMatrix{1, 1024, Int64, 1024}, Tuple{Base.Slice{StaticArrays.SOneTo{1}}, Int64}, true})
        @ ./operators.jl:279 [inlined]
      [6] overdub
        @ ./operators.jl:279 [inlined]
      [7] >(::SubArray{Int64, 1, StaticArrays.MMatrix{1, 1024, Int64, 1024}, Tuple{Base.Slice{StaticArrays.SOneTo{1}}, Int64}, true}, ::Int64)
        @ ./operators.jl:305 [inlined]
      [8] overdub
        @ ./operators.jl:305 [inlined]
      [9] overdub
        @ ~/.julia/packages/KernelAbstractions/8W8KX/src/KernelAbstractions.jl:266 [inlined]
     [10] __thread_run(tid::Int64, len::Int64, rem::Int64, obj::KernelAbstractions.Kernel{CPU, KernelAbstractions.NDIteration.StaticSize{(1024,)}, KernelAbstractions.NDIteration.DynamicSize, var"#cpu_dot#5"{Float32}}, ndrange::Tuple{Int64}, iterspace::KernelAbstractions.NDIteration.NDRange{1, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.StaticSize{(1024,)}, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, Nothing}, args::Tuple{Vector{Float32}, Vector{Float32}, Int64, Vector{Float32}}, dynamic::KernelAbstractions.NDIteration.NoDynamicCheck)
        @ KernelAbstractions ~/.julia/packages/KernelAbstractions/8W8KX/src/cpu.jl:157
     [11] __run(obj::KernelAbstractions.Kernel{CPU, KernelAbstractions.NDIteration.StaticSize{(1024,)}, KernelAbstractions.NDIteration.DynamicSize, var"#cpu_dot#5"{Float32}}, ndrange::Tuple{Int64}, iterspace::KernelAbstractions.NDIteration.NDRange{1, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.StaticSize{(1024,)}, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, Nothing}, args::Tuple{Vector{Float32}, Vector{Float32}, Int64, Vector{Float32}}, dynamic::KernelAbstractions.NDIteration.NoDynamicCheck)
        @ KernelAbstractions ~/.julia/packages/KernelAbstractions/8W8KX/src/cpu.jl:130
     [12] (::KernelAbstractions.var"#33#34"{Nothing, Nothing, typeof(KernelAbstractions.__run), Tuple{KernelAbstractions.Kernel{CPU, KernelAbstractions.NDIteration.StaticSize{(1024,)}, KernelAbstractions.NDIteration.DynamicSize, var"#cpu_dot#5"{Float32}}, Tuple{Int64}, KernelAbstractions.NDIteration.NDRange{1, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.StaticSize{(1024,)}, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, Nothing}, Tuple{Vector{Float32}, Vector{Float32}, Int64, Vector{Float32}}, KernelAbstractions.NDIteration.NoDynamicCheck}})()
        @ KernelAbstractions ~/.julia/packages/KernelAbstractions/8W8KX/src/cpu.jl:22
in expression starting at /home/tom/babelstream-upstream/JuliaStream.jl/src/Test.jl:107

Removing the @synchronize macro from the while loop makes the error go away, but then the answer is incorrect.
I've also tried @print with eltype(offset[1]); it prints the correct generic type (Float32 in this case), so I'm not sure what @synchronize is doing here.
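
In case it helps triage, here is a stripped-down kernel that I'd expect to hit the same code path: just a @private counter driving a while loop that contains @synchronize (a minimal sketch, not re-run in isolation):

using KernelAbstractions

# Minimal sketch: the dot product and the reduction body are gone, only the
# @private loop counter and the @synchronize inside the while loop remain.
@kernel function sync_in_while()
  offset = @private Int64 (1,)
  @inbounds begin
    offset[1] = @groupsize()[1] ÷ 2
    while offset[1] > 0
      @synchronize
      offset[1] ÷= 2
    end
  end
end

wait(sync_in_while(CPU(), 1024)(ndrange = 1024))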

For reference, here is what pkg status says:

  [21141c5a] AMDGPU v0.2.7
  [c7e460c6] ArgParse v1.1.4
  [052768ef] CUDA v3.2.1
  [72cfdca4] CUDAKernels v0.3.0
  [e2ba6199] ExprTools v0.1.3 ⚲
  [63c18a36] KernelAbstractions v0.7.0
  [d96e819e] Parameters v0.12.2
  [7eb9e9f0] ROCKernels v0.2.0
  [8ba89e20] Distributed

And the complete Test.jl reproducer:

using Base: Float32
using ROCKernels, CUDAKernels, KernelAbstractions, CUDA, AMDGPU

const TBSize = 1024::Int
const DotBlocks = 256::Int

@enum Backend cuda rocm cpu
const DeviceWithRepr = Tuple{Any,String,Backend}

function list_rocm_devices()::Vector{DeviceWithRepr}
  try
    # AMDGPU.agents()'s internal iteration order isn't stable
    sorted = sort(AMDGPU.get_agents(:gpu), by = repr)
    map(x -> (x, repr(x), rocm), sorted)
  catch
    # probably unsupported
    []
  end
end

function list_cuda_devices()::Vector{DeviceWithRepr}
  return !CUDA.functional(false) ? [] :
         map(d -> (d, "$(CUDA.name(d)) ($(repr(d)))", cuda), CUDA.devices())
end

function devices()::Vector{DeviceWithRepr}
  cudas = list_cuda_devices()
  rocms = list_rocm_devices()
  cpus = [(undef, "$(Sys.cpu_info()[1].model) ($(Threads.nthreads())T)", cpu)]
  vcat(cpus, cudas, rocms)
end


function runDot(device::DeviceWithRepr, size::Int, ainit::T, binit::T)::Tuple{T,T} where {T}
  (actual, name, backend) = device
  println("Using device: ", name)
  as = fill(ainit, size)
  bs = fill(binit, size)
  if backend == cpu
    partial_sum = Vector{T}(undef, DotBlocks)
    a = Vector{T}(as)
    b = Vector{T}(bs)
    backendDevice = CPU()
  elseif backend == cuda
    CUDA.device!(actual)
    partial_sum = CuArray{T}(undef, DotBlocks)
    a = CuArray{T}(as)
    b = CuArray{T}(bs)
    backendDevice = CUDADevice()
  elseif backend == rocm
    AMDGPU.DEFAULT_AGENT[] = actual
    partial_sum = ROCArray{T}(undef, DotBlocks)
    a = ROCArray{T}(as)
    b = ROCArray{T}(bs)
    backendDevice = ROCDevice()
  else
    error("unsupported backend $(backend)")
  end

  @kernel function dot(@Const(a), @Const(b), size, partial)
    local_i = @index(Local)
    group_i = @index(Group)
    tb_sum = @localmem T TBSize
    @inbounds tb_sum[local_i] = 0.0

    # do dot first
    i = @index(Global)
    while i <= size
      @inbounds tb_sum[local_i] += a[i] * b[i]
      i += TBSize * DotBlocks
    end

    # then tree reduction
    offset = @private Int64 (1,)
    @inbounds begin
      offset[1] = @groupsize()[1] ÷ 2
      while offset[1] > 0
        @synchronize
        if (local_i - 1) < offset[1]
          tb_sum[local_i] += tb_sum[local_i+offset[1]]
        end
        offset[1] ÷= 2
      end
    end

    if (local_i == 1)
      @inbounds partial[group_i] = tb_sum[local_i]
    end
  end

  function referenceDot()
    sum = zero(T)
    for i = 1:size
      @inbounds sum += a[i] * b[i]
    end
    return sum
  end

  wait(dot(backendDevice, TBSize)(a, b, size, partial_sum, ndrange = TBSize * DotBlocks))
  return (referenceDot(), sum(partial_sum))
end

device = devices()[1]
(expected, actual) = runDot(device, TBSize * 2, 1.0f0, 2.0f0)
println("actual=", actual, ", expected=", expected)
tom91136 reopened this Jul 22, 2021