@synchronize inside while loops doesn't work on CPUs #262
Open
tom91136 opened this issue Jul 20, 2021 · 0 comments

I'm trying to port the classic GPU tree reduction via KernelAbstractions.jl.
See this for the direct CUDA implementation of what I'm trying to port from (a simplified sketch of it is also included below, after my KernelAbstractions version).
This is what I have implemented currently:

const TBSize = 1024::Int
const DotBlocks = 256::Int

@kernel function dot(@Const(a), @Const(b), size, partial)
  # T is the element type parameter of the enclosing function (see the full reproducer below)
  local_i = @index(Local)
  group_i = @index(Group)
  tb_sum = @localmem T TBSize
  @inbounds tb_sum[local_i] = 0.0

  # do dot first
  i = @index(Global)
  while i <= size
    @inbounds tb_sum[local_i] += a[i] * b[i]
    i += TBSize * DotBlocks
  end

  # then tree reduction
  offset = @private Int64 (1,)
  @inbounds begin
    offset[1] = @groupsize()[1] ÷ 2
    while offset[1] > 0
      @synchronize
      if (local_i - 1) < offset[1]
        tb_sum[local_i] += tb_sum[local_i+offset[1]]
      end
      offset[1] ÷= 2
    end
  end

  if (local_i == 1)
    @inbounds partial[group_i] = tb_sum[local_i]
  end
end

# driver
wait(dot(backendDevice, TBSize)(a, b, size, partial_sum, ndrange = TBSize * DotBlocks))
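
For context, the CUDA.jl kernel this is ported from looks roughly like the sketch below (simplified, not verbatim from the linked CUDAStream.jl; Float32 and the 1024/256 sizing are hard-coded here, and the launch assumes a, b, size, partial_sum are set up as in the driver above):

using CUDA

# Simplified sketch of the CUDA.jl dot kernel being ported: grid-stride partial
# sums into shared memory, then a tree reduction with sync_threads() inside a
# while loop, which works fine on the GPU.
function dot_kernel(a, b, size, partial)
  tb_sum = @cuStaticSharedMem(Float32, 1024)  # 1024 == TBSize
  local_i = threadIdx().x
  @inbounds tb_sum[local_i] = 0.0f0

  # grid-stride loop over the input
  i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
  while i <= size
    @inbounds tb_sum[local_i] += a[i] * b[i]
    i += blockDim().x * gridDim().x
  end

  # tree reduction within the block
  offset = blockDim().x ÷ 2
  while offset > 0
    sync_threads()
    if local_i - 1 < offset
      @inbounds tb_sum[local_i] += tb_sum[local_i+offset]
    end
    offset ÷= 2
  end

  if local_i == 1
    @inbounds partial[blockIdx().x] = tb_sum[local_i]
  end
  return
end

# launch: a, b are CuArrays, partial_sum is a CuArray of length DotBlocks
@cuda blocks = DotBlocks threads = TBSize dot_kernel(a, b, size, partial_sum)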

On the GPU backends I get correct results, and performance is mostly on par with our [CUDA.jl](https://github.com/UoB-HPC/BabelStream/blob/7c1e04a42b9b03b0e5c5d0b07c0ef9f4bdd59353/JuliaStream.jl/src/CUDAStream.jl#L112) and [AMDGPU.jl](https://github.com/UoB-HPC/BabelStream/blob/7c1e04a42b9b03b0e5c5d0b07c0ef9f4bdd59353/JuliaStream.jl/src/AMDGPUStream.jl#L135) implementations.
On the CPU, however, I get the following error:

Using device: AMD Ryzen 9 3900X 12-Core Processor (1T)
ERROR: LoadError: TaskFailedException
Stacktrace:
 [1] wait
   @ ./task.jl:322 [inlined]
 [2] wait
   @ ~/.julia/packages/KernelAbstractions/8W8KX/src/cpu.jl:65 [inlined]
 [3] wait (repeats 2 times)
   @ ~/.julia/packages/KernelAbstractions/8W8KX/src/cpu.jl:29 [inlined]
 [4] runDot(device::Tuple{UndefInitializer, String, Backend}, size::Int64, ainit::Float32, binit::Float32)
   @ Main ~/babelstream-upstream/JuliaStream.jl/src/Test.jl:102
 [5] top-level scope
   @ ~/babelstream-upstream/JuliaStream.jl/src/Test.jl:107

    nested task error: MethodError: no method matching isless(::Int64, ::SubArray{Int64, 1, StaticArrays.MMatrix{1, 1024, Int64, 1024}, Tuple{Base.Slice{StaticArrays.SOneTo{1}}, Int64}, true})
    Closest candidates are:
      isless(::AbstractVector{T} where T, ::AbstractVector{T} where T) at abstractarray.jl:1989
      isless(::Any, ::Missing) at missing.jl:88
      isless(::Missing, ::Any) at missing.jl:87
      ...
    Stacktrace:
      [1] call
        @ ~/.julia/packages/Cassette/N5kbV/src/context.jl:456 [inlined]
      [2] fallback
        @ ~/.julia/packages/Cassette/N5kbV/src/context.jl:454 [inlined]
      [3] _overdub_fallback(::Any, ::Vararg{Any, N} where N)
        @ ~/.julia/packages/Cassette/N5kbV/src/overdub.jl:582 [inlined]
      [4] overdub
        @ ~/.julia/packages/Cassette/N5kbV/src/overdub.jl:582 [inlined]
      [5] <(::Int64, ::SubArray{Int64, 1, StaticArrays.MMatrix{1, 1024, Int64, 1024}, Tuple{Base.Slice{StaticArrays.SOneTo{1}}, Int64}, true})
        @ ./operators.jl:279 [inlined]
      [6] overdub
        @ ./operators.jl:279 [inlined]
      [7] >(::SubArray{Int64, 1, StaticArrays.MMatrix{1, 1024, Int64, 1024}, Tuple{Base.Slice{StaticArrays.SOneTo{1}}, Int64}, true}, ::Int64)
        @ ./operators.jl:305 [inlined]
      [8] overdub
        @ ./operators.jl:305 [inlined]
      [9] overdub
        @ ~/.julia/packages/KernelAbstractions/8W8KX/src/KernelAbstractions.jl:266 [inlined]
     [10] __thread_run(tid::Int64, len::Int64, rem::Int64, obj::KernelAbstractions.Kernel{CPU, KernelAbstractions.NDIteration.StaticSize{(1024,)}, KernelAbstractions.NDIteration.DynamicSize, var"#cpu_dot#5"{Float32}}, ndrange::Tuple{Int64}, iterspace::KernelAbstractions.NDIteration.NDRange{1, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.StaticSize{(1024,)}, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, Nothing}, args::Tuple{Vector{Float32}, Vector{Float32}, Int64, Vector{Float32}}, dynamic::KernelAbstractions.NDIteration.NoDynamicCheck)
        @ KernelAbstractions ~/.julia/packages/KernelAbstractions/8W8KX/src/cpu.jl:157
     [11] __run(obj::KernelAbstractions.Kernel{CPU, KernelAbstractions.NDIteration.StaticSize{(1024,)}, KernelAbstractions.NDIteration.DynamicSize, var"#cpu_dot#5"{Float32}}, ndrange::Tuple{Int64}, iterspace::KernelAbstractions.NDIteration.NDRange{1, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.StaticSize{(1024,)}, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, Nothing}, args::Tuple{Vector{Float32}, Vector{Float32}, Int64, Vector{Float32}}, dynamic::KernelAbstractions.NDIteration.NoDynamicCheck)
        @ KernelAbstractions ~/.julia/packages/KernelAbstractions/8W8KX/src/cpu.jl:130
     [12] (::KernelAbstractions.var"#33#34"{Nothing, Nothing, typeof(KernelAbstractions.__run), Tuple{KernelAbstractions.Kernel{CPU, KernelAbstractions.NDIteration.StaticSize{(1024,)}, KernelAbstractions.NDIteration.DynamicSize, var"#cpu_dot#5"{Float32}}, Tuple{Int64}, KernelAbstractions.NDIteration.NDRange{1, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.StaticSize{(1024,)}, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, Nothing}, Tuple{Vector{Float32}, Vector{Float32}, Int64, Vector{Float32}}, KernelAbstractions.NDIteration.NoDynamicCheck}})()
        @ KernelAbstractions ~/.julia/packages/KernelAbstractions/8W8KX/src/cpu.jl:22
in expression starting at /home/tom/babelstream-upstream/JuliaStream.jl/src/Test.jl:107

Removing the @synchronize macro from the while loop makes the error go away, but then the answer is incorrect.
I've also tried @print with eltype(offset[1]); it prints the correct generic type (Float32 in this case), so I'm not sure what @synchronize is doing here.
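
In case it helps triage, here is a stripped-down kernel that I'd expect to hit the same code path: just a @private counter driving a while loop that contains @synchronize (a minimal sketch, not re-run in isolation):

using KernelAbstractions

# Minimal sketch: the dot product and the reduction body are gone, only the
# @private loop counter and the @synchronize inside the while loop remain.
@kernel function sync_in_while()
  offset = @private Int64 (1,)
  @inbounds begin
    offset[1] = @groupsize()[1] ÷ 2
    while offset[1] > 0
      @synchronize
      offset[1] ÷= 2
    end
  end
end

wait(sync_in_while(CPU(), 1024)(ndrange = 1024))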

For reference, here is what pkg status says:

  [21141c5a] AMDGPU v0.2.7
  [c7e460c6] ArgParse v1.1.4
  [052768ef] CUDA v3.2.1
  [72cfdca4] CUDAKernels v0.3.0
  [e2ba6199] ExprTools v0.1.3 ⚲
  [63c18a36] KernelAbstractions v0.7.0
  [d96e819e] Parameters v0.12.2
  [7eb9e9f0] ROCKernels v0.2.0
  [8ba89e20] Distributed

And the complete Test.jl reproducer:

using Base: Float32
using ROCKernels, CUDAKernels, KernelAbstractions, CUDA, AMDGPU

const TBSize = 1024::Int
const DotBlocks = 256::Int

@enum Backend cuda rocm cpu
const DeviceWithRepr = Tuple{Any,String,Backend}

function list_rocm_devices()::Vector{DeviceWithRepr}
  try
    # AMDGPU.agents()'s internal iteration order isn't stable
    sorted = sort(AMDGPU.get_agents(:gpu), by = repr)
    map(x -> (x, repr(x), rocm), sorted)
  catch
    # probably unsupported
    []
  end
end

function list_cuda_devices()::Vector{DeviceWithRepr}
  return !CUDA.functional(false) ? [] :
         map(d -> (d, "$(CUDA.name(d)) ($(repr(d)))", cuda), CUDA.devices())
end

function devices()::Vector{DeviceWithRepr}
  cudas = list_cuda_devices()
  rocms = list_rocm_devices()
  cpus = [(undef, "$(Sys.cpu_info()[1].model) ($(Threads.nthreads())T)", cpu)]
  vcat(cpus, cudas, rocms)
end


function runDot(device::DeviceWithRepr, size::Int, ainit::T, binit::T)::Tuple{T,T} where {T}
  (actual, name, backend) = device
  println("Using device: ", name)
  as = fill(ainit, size)
  bs = fill(binit, size)
  if backend == cpu
    partial_sum = Vector{T}(undef, DotBlocks)
    a = Vector{T}(as)
    b = Vector{T}(bs)
    backendDevice = CPU()
  elseif backend == cuda
    CUDA.device!(actual)
    partial_sum = CuArray{T}(undef, DotBlocks)
    a = CuArray{T}(as)
    b = CuArray{T}(bs)
    backendDevice = CUDADevice()
  elseif backend == rocm
    AMDGPU.DEFAULT_AGENT[] = actual
    partial_sum = ROCArray{T}(undef, DotBlocks)
    a = ROCArray{T}(as)
    b = ROCArray{T}(bs)
    backendDevice = ROCDevice()
  else
    error("unsupported backend $(backend)")
  end

  @kernel function dot(@Const(a), @Const(b), size, partial)
    local_i = @index(Local)
    group_i = @index(Group)
    tb_sum = @localmem T TBSize
    @inbounds tb_sum[local_i] = 0.0

    # do dot first
    i = @index(Global)
    while i <= size
      @inbounds tb_sum[local_i] += a[i] * b[i]
      i += TBSize * DotBlocks
    end

    # then tree reduction
    offset = @private Int64 (1,)
    @inbounds begin
      offset[1] = @groupsize()[1] ÷ 2
      while offset[1] > 0
        @synchronize
        if (local_i - 1) < offset[1]
          tb_sum[local_i] += tb_sum[local_i+offset[1]]
        end
        offset[1] ÷= 2
      end
    end

    if (local_i == 1)
      @inbounds partial[group_i] = tb_sum[local_i]
    end
  end

  function referenceDot()
    sum = zero(T)
    for i = 1:size
      @inbounds sum += a[i] * b[i]
    end
    return sum
  end

  wait(dot(backendDevice, TBSize)(a, b, size, partial_sum, ndrange = TBSize * DotBlocks))
  return (referenceDot(), sum(partial_sum))
end

device = devices()[1]
(expected, actual) = runDot(device, TBSize * 2, 1.0f0, 2.0f0)
println("actual=", actual, ", expected=", expected)
tom91136 reopened this Jul 22, 2021