From 8602a1e7b7972507ad5b34f744d4f84a28fe9b88 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Tue, 19 Mar 2019 15:45:10 +0100 Subject: [PATCH 01/34] Support for keyword arguments to at-cuda. --- src/execution.jl | 56 ++++++++++++++++++++++++++++++++++-------------- 1 file changed, 40 insertions(+), 16 deletions(-) diff --git a/src/execution.jl b/src/execution.jl index 8ae407f8..ff6a3404 100644 --- a/src/execution.jl +++ b/src/execution.jl @@ -63,15 +63,19 @@ end # split keyword arguments to `@cuda` into ones affecting the compiler, or the execution function split_kwargs(kwargs) + macro_kws = [:dynamic] compiler_kws = [:minthreads, :maxthreads, :blocks_per_sm, :maxregs] call_kws = [:blocks, :threads, :shmem, :stream] + macro_kwargs = [] compiler_kwargs = [] call_kwargs = [] for kwarg in kwargs if Meta.isexpr(kwarg, :(=)) key,val = kwarg.args if isa(key, Symbol) - if key in compiler_kws + if key in macro_kws + push!(macro_kwargs, kwarg) + elseif key in compiler_kws push!(compiler_kwargs, kwarg) elseif key in call_kws push!(call_kwargs, kwarg) @@ -86,7 +90,7 @@ function split_kwargs(kwargs) end end - return compiler_kwargs, call_kwargs + return macro_kwargs, compiler_kwargs, call_kwargs end # assign arguments to variables, handle splatting @@ -195,20 +199,39 @@ macro cuda(ex...) args = call.args[2:end] code = quote end - compiler_kwargs, call_kwargs = split_kwargs(kwargs) + macro_kwargs, compiler_kwargs, call_kwargs = split_kwargs(kwargs) vars, var_exprs = assign_args!(code, args) - # convert the arguments, call the compiler and launch the kernel - # while keeping the original arguments alive - push!(code.args, - quote - GC.@preserve $(vars...) begin - local kernel_args = cudaconvert.(($(var_exprs...),)) - local kernel_tt = Tuple{Core.Typeof.(kernel_args)...} - local kernel = cufunction($(esc(f)), kernel_tt; $(map(esc, compiler_kwargs)...)) - kernel(kernel_args...; $(map(esc, call_kwargs)...)) - end - end) + # handle keyword arguments that influence the macro's behavior + dynamic = false + for kwarg in macro_kwargs + key,val = kwarg.args + if key == :dynamic + dynamic = val::Bool + else + throw(ArgumentError("Unsupported keyword argument '$key'")) + end + end + + if dynamic + # dynamic, device-side kernel launch + error("unsupported") + else + # regular, host-side kernel launch + # + # convert the arguments, call the compiler and launch the kernel + # while keeping the original arguments alive + push!(code.args, + quote + GC.@preserve $(vars...) begin + local kernel_args = cudaconvert.(($(var_exprs...),)) + local kernel_tt = Tuple{Core.Typeof.(kernel_args)...} + local kernel = cufunction($(esc(f)), kernel_tt; $(map(esc, compiler_kwargs)...)) + kernel(kernel_args...; $(map(esc, call_kwargs)...)) + end + end) + end + return code end @@ -310,8 +333,8 @@ end end end -# There doesn't seem to be a way to access the documentation for the call-syntax, -# so attach it to the type +# FIXME: there doesn't seem to be a way to access the documentation for the call-syntax, +# so attach it to the type """ (::Kernel)(args...; kwargs...) @@ -326,6 +349,7 @@ The following keyword arguments are supported: """ Kernel + ## other """ From 8509ce5b17b40221bc5803551cfbb4e74007ad95 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Tue, 19 Mar 2019 15:50:56 +0100 Subject: [PATCH 02/34] Update docs. 
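As a rough usage sketch of the keyword groups being documented (the kernel, the device arrays `d_a`/`d_b`/`d_c` and the launch configuration are hypothetical, and `dynamic=true` still errors at this point in the series):

    # hypothetical element-wise kernel
    function vadd(a, b, c)
        i = threadIdx().x
        c[i] = a[i] + b[i]
        return
    end

    # `maxregs` is handled by the compiler (cufunction), `threads` by the
    # kernel call, and `dynamic` by the @cuda macro itself
    @cuda threads=32 maxregs=16 dynamic=false vadd(d_a, d_b, d_c)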
--- src/execution.jl | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/src/execution.jl b/src/execution.jl index ff6a3404..8dae43e7 100644 --- a/src/execution.jl +++ b/src/execution.jl @@ -167,13 +167,14 @@ a CUDA function upon first use, and to a certain extent arguments will be conver managed automatically using `cudaconvert`. Finally, a call to `CUDAdrv.cudacall` is performed, scheduling a kernel launch on the current CUDA context. -Several keyword arguments are supported that influence kernel compilation and execution. For -more information, refer to the documentation of respectively [`cufunction`](@ref) and -[`CUDAnative.Kernel`](@ref) +Several keyword arguments are supported that influence the behavior of `@cuda`. +- `dynamic`: use dynamic parallelism to launch device-side kernels +- arguments that influence kernel compilation: see [`cufunction`](@ref) +- arguments that influence kernel execution: see [`CUDAnative.Kernel`](@ref) The underlying operations (argument conversion, kernel compilation, kernel call) can be performed explicitly when more control is needed, e.g. to reflect on the resource usage of a -kernel to determine the launch configuration: +kernel to determine the launch configuration. A host-side kernel launch is done as follows: args = ... GC.@preserve args begin @@ -248,11 +249,11 @@ Low-level interface to compile a function invocation for the currently-active GP a callable kernel object. For a higher-level interface, use [`@cuda`](@ref). The following keyword arguments are supported: -- minthreads: the required number of threads in a thread block. -- maxthreads: the maximum number of threads in a thread block. -- blocks_per_sm: a minimum number of thread blocks to be scheduled on a single - multiprocessor. -- maxregs: the maximum number of registers to be allocated to a single thread (only +- `minthreads`: the required number of threads in a thread block +- `maxthreads`: the maximum number of threads in a thread block +- `blocks_per_sm`: a minimum number of thread blocks to be scheduled on a single + multiprocessor +- `maxregs`: the maximum number of registers to be allocated to a single thread (only supported on LLVM 4.0+) The output of this function is automatically cached, i.e. you can simply call `cufunction` @@ -342,10 +343,10 @@ Low-level interface to call a compiled kernel, passing GPU-compatible arguments For a higher-level interface, use [`@cuda`](@ref). The following keyword arguments are supported: -- threads (defaults to 1) -- blocks (defaults to 1) -- shmem (defaults to 0) -- stream (defaults to the default stream) +- `threads` (defaults to 1) +- `blocks` (defaults to 1) +- `shmem` (defaults to 0) +- `stream` (defaults to the default stream) """ Kernel From 3e238101c2573f53154d4852b15e6aca3c19a918 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Tue, 19 Mar 2019 18:08:53 +0100 Subject: [PATCH 03/34] Support for detecting dynamic calls and linking code. --- src/compiler/driver.jl | 86 +++++++++++++++++++++++---------- src/device/cuda/libcudadevrt.jl | 2 +- src/execution.jl | 24 ++++++++- src/reflection.jl | 3 +- test/codegen.jl | 6 +-- 5 files changed, 90 insertions(+), 31 deletions(-) diff --git a/src/compiler/driver.jl b/src/compiler/driver.jl index 89bb67f5..3273da8a 100644 --- a/src/compiler/driver.jl +++ b/src/compiler/driver.jl @@ -12,10 +12,16 @@ function module respectively of type `CuFuction` and `CuModule`. For a list of supported keyword arguments, refer to the documentation of [`cufunction`](@ref). 
""" -function compile(dev::CuDevice, @nospecialize(f::Core.Function), @nospecialize(tt); kwargs...) +compile(dev::CuDevice, @nospecialize(f::Core.Function), @nospecialize(tt); + kernel=true, kwargs...) = + compile(CompilerContext(f, tt, supported_capability(dev), kernel; kwargs...)) + +function compile(ctx::CompilerContext) CUDAnative.configured || error("CUDAnative.jl has not been configured; cannot JIT code.") - module_asm, module_entry = compile(supported_capability(dev), f, tt; kwargs...) + # generate code + ir, entry = codegen(ctx) + asm = mcgen(ctx, ir, entry) # enable debug options based on Julia's debug setting jit_options = Dict{CUDAdrv.CUjit_option,Any}() @@ -25,27 +31,23 @@ function compile(dev::CuDevice, @nospecialize(f::Core.Function), @nospecialize(t jit_options[CUDAdrv.GENERATE_DEBUG_INFO] = true end - # Link libcudadevrt + # link the CUDA device library linker = CUDAdrv.CuLink(jit_options) CUDAdrv.add_file!(linker, libcudadevrt, CUDAdrv.LIBRARY) - CUDAdrv.add_data!(linker, module_entry, module_asm) + CUDAdrv.add_data!(linker, LLVM.name(entry), asm) image = CUDAdrv.complete(linker) cuda_mod = CuModule(image, jit_options) - cuda_fun = CuFunction(cuda_mod, module_entry) + cuda_fun = CuFunction(cuda_mod, LLVM.name(entry)) return cuda_fun, cuda_mod end -# same as above, but without an active device -function compile(cap::VersionNumber, @nospecialize(f), @nospecialize(tt); - kernel=true, kwargs...) - ctx = CompilerContext(f, tt, cap, kernel; kwargs...) - - return compile(ctx) -end +codegen(cap::VersionNumber, @nospecialize(f::Core.Function), @nospecialize(tt); + kernel=true, kwargs...) = + codegen(CompilerContext(f, tt, cap, kernel; kwargs...)) -function compile(ctx::CompilerContext) +function codegen(ctx::CompilerContext) if compile_hook[] != nothing hook = compile_hook[] compile_hook[] = nothing @@ -69,38 +71,72 @@ function compile(ctx::CompilerContext) ## low-level code generation (LLVM IR) - mod, entry = irgen(ctx) + ir, entry = irgen(ctx) need_library(lib) = any(f -> isdeclaration(f) && intrinsic_id(f) == 0 && haskey(functions(lib), LLVM.name(f)), - functions(mod)) + functions(ir)) libdevice = load_libdevice(ctx.cap) if need_library(libdevice) - link_libdevice!(ctx, mod, libdevice) + link_libdevice!(ctx, ir, libdevice) end # optimize the IR - entry = optimize!(ctx, mod, entry) + entry = optimize!(ctx, ir, entry) runtime = load_runtime(ctx.cap) if need_library(runtime) - link_library!(ctx, mod, runtime) + link_library!(ctx, ir, runtime) end - prepare_execution!(ctx, mod) + prepare_execution!(ctx, ir) check_invocation(ctx, entry) - # check generated IR - check_ir(ctx, mod) - verify(mod) + + ## dynamic parallelism + + # find dynamic kernel invocations + dyn_calls = [] + if haskey(functions(ir), "cudanativeLaunchDevice") + f = functions(ir)["cudanativeLaunchDevice"] + for use in uses(f) + # decode the call + # FIXME: recover this earlier, from the Julia IR + call = user(use)::LLVM.CallInst + ops = collect(operands(call))[1:2] + ## addrspacecast + ops = LLVM.Value[first(operands(val)) for val in ops] + ## inttoptr + ops = ConstantInt[first(operands(val)) for val in ops] + ## integer constants + ops = convert.(Int, ops) + ## actual pointer values + ops = Ptr{Any}.(ops) + + dyn_f, dyn_tt = unsafe_pointer_to_objref.(ops) + push!(dyn_calls, (call, dyn_f, dyn_tt)) + end + end + + # compile and link + for (call, dyn_f, dyn_tt) in dyn_calls + dyn_ctx = CompilerContext(dyn_f, dyn_tt, ctx.cap, true) + dyn_ir, dyn_entry = codegen(dyn_ctx) + link_library!(ctx, ir, dyn_ir) + + # TODO + 
unsafe_delete!(LLVM.parent(call), call) + end - ## machine code generation (PTX assembly) + ## finalization - module_asm = mcgen(ctx, mod, entry) + # check generated IR + check_ir(ctx, ir) + verify(ir) - return module_asm, LLVM.name(entry) + return ir, entry end diff --git a/src/device/cuda/libcudadevrt.jl b/src/device/cuda/libcudadevrt.jl index 861d7d9b..39faf3b2 100644 --- a/src/device/cuda/libcudadevrt.jl +++ b/src/device/cuda/libcudadevrt.jl @@ -13,4 +13,4 @@ and should not be called from the host. `synchronize` acts as a synchronization point for child grids in the context of dynamic parallelism. """ -@inline synchronize() = @wrap cudaDeviceSynchronize()::i32 +@inline synchronize() = ccall("extern cudaDeviceSynchronize", llvmcall, Cint, ()) diff --git a/src/execution.jl b/src/execution.jl index 8dae43e7..c2dab477 100644 --- a/src/execution.jl +++ b/src/execution.jl @@ -216,7 +216,16 @@ macro cuda(ex...) if dynamic # dynamic, device-side kernel launch - error("unsupported") + # + # WIP + # TODO: GC.@preserve? + # TODO: error on, or support kwargs + kernel_args = var_exprs # already in kernel land, so don't need a conversion + push!(code.args, + quote + local kernel_tt = Tuple{$((:(Core.Typeof($var)) for var in var_exprs)...)} + dynamic_cufunction($(esc(f)), kernel_tt) + end) else # regular, host-side kernel launch # @@ -236,6 +245,19 @@ macro cuda(ex...) return code end +@generated function dynamic_cufunction(f::Core.Function, tt::Type=Tuple{}) + tt = Base.to_tuple_type(tt.parameters[1]) + sig = Base.signature_type(f, tt) + t = Tuple(tt.parameters) + # TODO: closures + + quote + # drop the f and tt into the module, and recover them later during compilation + ccall("extern cudanativeLaunchDevice", llvmcall, Nothing, (Any, Any), f, tt) + nothing + end +end + ## APIs for manual compilation diff --git a/src/reflection.jl b/src/reflection.jl index c8760ff7..fc5bb388 100644 --- a/src/reflection.jl +++ b/src/reflection.jl @@ -112,7 +112,8 @@ function code_sass(io::IO, ctx::CompilerContext) error("Your CUDA installation does not provide ptxas or nvdisasm, both of which are required for code_sass") end - ptx,_ = compile(ctx) + ir, entry = codegen(ctx) + ptx = mcgen(ctx, ir, entry) fn = tempname() gpu = "sm_$(ctx.cap.major)$(ctx.cap.minor)" diff --git a/test/codegen.jl b/test/codegen.jl index 89d44e71..68eada07 100644 --- a/test/codegen.jl +++ b/test/codegen.jl @@ -461,7 +461,7 @@ end @testset "non-isbits arguments" begin foobar(i) = (sink(unsafe_trunc(Int,i)); return) - @test_throws_message(CUDAnative.KernelError, CUDAnative.compile(v"3.5", foobar, Tuple{BigInt})) do msg + @test_throws_message(CUDAnative.KernelError, CUDAnative.codegen(v"3.5", foobar, Tuple{BigInt})) do msg occursin("passing and using non-bitstype argument", msg) && occursin("BigInt", msg) end @@ -470,7 +470,7 @@ end @testset "invalid LLVM IR" begin foobar(i) = println(i) - @test_throws_message(CUDAnative.InvalidIRError, CUDAnative.compile(v"3.5", foobar, Tuple{Int})) do msg + @test_throws_message(CUDAnative.InvalidIRError, CUDAnative.codegen(v"3.5", foobar, Tuple{Int})) do msg occursin("invalid LLVM IR", msg) && occursin(CUDAnative.RUNTIME_FUNCTION, msg) && occursin("[1] println", msg) && @@ -481,7 +481,7 @@ end @testset "invalid LLVM IR (ccall)" begin foobar(p) = (unsafe_store!(p, ccall(:time, Cint, ())); nothing) - @test_throws_message(CUDAnative.InvalidIRError, CUDAnative.compile(v"3.5", foobar, Tuple{Ptr{Int}})) do msg + @test_throws_message(CUDAnative.InvalidIRError, CUDAnative.codegen(v"3.5", foobar, Tuple{Ptr{Int}})) do 
msg occursin("invalid LLVM IR", msg) && occursin(CUDAnative.POINTER_FUNCTION, msg) && occursin(r"\[1\] .+foobar", msg) From 56b59e388a8ff32cce8693ce86e85add1a70d77a Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Tue, 19 Mar 2019 18:22:41 +0100 Subject: [PATCH 04/34] Make codegen() support invalid code. --- src/compiler/driver.jl | 20 +++++++++----------- src/execution.jl | 4 +++- test/codegen.jl | 6 +++--- 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/src/compiler/driver.jl b/src/compiler/driver.jl index 3273da8a..2abf7a9a 100644 --- a/src/compiler/driver.jl +++ b/src/compiler/driver.jl @@ -4,23 +4,27 @@ const compile_hook = Ref{Union{Nothing,Function}}(nothing) """ - compile(dev::CuDevice, f, tt; kwargs...) + compile(cap::VersionNumber, f, tt; kernel=true, kwargs...) -Compile a function `f` invoked with types `tt` for device `dev`, returning the compiled -function module respectively of type `CuFuction` and `CuModule`. +Compile a function `f` invoked with types `tt` for device `dev` or its compute capability +`cap`, returning the compiled function module respectively of type `CuFuction` and +`CuModule`. For a list of supported keyword arguments, refer to the documentation of [`cufunction`](@ref). """ -compile(dev::CuDevice, @nospecialize(f::Core.Function), @nospecialize(tt); +compile(cap::VersionNumber, @nospecialize(f::Core.Function), @nospecialize(tt); kernel=true, kwargs...) = - compile(CompilerContext(f, tt, supported_capability(dev), kernel; kwargs...)) + compile(CompilerContext(f, tt, cap, kernel; kwargs...)) function compile(ctx::CompilerContext) CUDAnative.configured || error("CUDAnative.jl has not been configured; cannot JIT code.") # generate code ir, entry = codegen(ctx) + check_invocation(ctx, entry) + check_ir(ctx, ir) + verify(ir) asm = mcgen(ctx, ir, entry) # enable debug options based on Julia's debug setting @@ -93,8 +97,6 @@ function codegen(ctx::CompilerContext) prepare_execution!(ctx, ir) - check_invocation(ctx, entry) - ## dynamic parallelism @@ -134,9 +136,5 @@ function codegen(ctx::CompilerContext) ## finalization - # check generated IR - check_ir(ctx, ir) - verify(ir) - return ir, entry end diff --git a/src/execution.jl b/src/execution.jl index c2dab477..3609e594 100644 --- a/src/execution.jl +++ b/src/execution.jl @@ -312,7 +312,9 @@ when function changes, or when different types or keyword arguments are provided key = hash(getfield(f, nf), key) end if !haskey(compilecache, key) - fun, mod = compile(device(ctx), f, tt; kwargs...) + dev = device(ctx) + cap = supported_capability(dev) + fun, mod = compile(cap, f, tt; kwargs...) 
kernel = Kernel{f,tt}(ctx, mod, fun) @debug begin ver = version(kernel) diff --git a/test/codegen.jl b/test/codegen.jl index 68eada07..89d44e71 100644 --- a/test/codegen.jl +++ b/test/codegen.jl @@ -461,7 +461,7 @@ end @testset "non-isbits arguments" begin foobar(i) = (sink(unsafe_trunc(Int,i)); return) - @test_throws_message(CUDAnative.KernelError, CUDAnative.codegen(v"3.5", foobar, Tuple{BigInt})) do msg + @test_throws_message(CUDAnative.KernelError, CUDAnative.compile(v"3.5", foobar, Tuple{BigInt})) do msg occursin("passing and using non-bitstype argument", msg) && occursin("BigInt", msg) end @@ -470,7 +470,7 @@ end @testset "invalid LLVM IR" begin foobar(i) = println(i) - @test_throws_message(CUDAnative.InvalidIRError, CUDAnative.codegen(v"3.5", foobar, Tuple{Int})) do msg + @test_throws_message(CUDAnative.InvalidIRError, CUDAnative.compile(v"3.5", foobar, Tuple{Int})) do msg occursin("invalid LLVM IR", msg) && occursin(CUDAnative.RUNTIME_FUNCTION, msg) && occursin("[1] println", msg) && @@ -481,7 +481,7 @@ end @testset "invalid LLVM IR (ccall)" begin foobar(p) = (unsafe_store!(p, ccall(:time, Cint, ())); nothing) - @test_throws_message(CUDAnative.InvalidIRError, CUDAnative.codegen(v"3.5", foobar, Tuple{Ptr{Int}})) do msg + @test_throws_message(CUDAnative.InvalidIRError, CUDAnative.compile(v"3.5", foobar, Tuple{Ptr{Int}})) do msg occursin("invalid LLVM IR", msg) && occursin(CUDAnative.POINTER_FUNCTION, msg) && occursin(r"\[1\] .+foobar", msg) From d43a3b143f7afc30b572344abbd5a3509988d549 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Tue, 19 Mar 2019 20:34:42 +0100 Subject: [PATCH 05/34] Restructure the main compiler entry-point for better reuse. --- src/compiler/driver.jl | 152 +++++++++++++++++++++++------------------ src/execution.jl | 2 +- src/reflection.jl | 25 ++----- test/codegen.jl | 6 +- 4 files changed, 95 insertions(+), 90 deletions(-) diff --git a/src/compiler/driver.jl b/src/compiler/driver.jl index 2abf7a9a..ef410aad 100644 --- a/src/compiler/driver.jl +++ b/src/compiler/driver.jl @@ -4,76 +4,55 @@ const compile_hook = Ref{Union{Nothing,Function}}(nothing) """ - compile(cap::VersionNumber, f, tt; kernel=true, kwargs...) + compile(to::Symbol, cap::VersionNumber, f, tt; + kernel=true, optimize=true, strip=false, hooked=false, + kwargs...) -Compile a function `f` invoked with types `tt` for device `dev` or its compute capability -`cap`, returning the compiled function module respectively of type `CuFuction` and -`CuModule`. +Compile a function `f` invoked with types `tt` for device capability `cap` to one of the +following formats as specified by the `to` argument: `:julia` for Julia IR, `:llvm` for LLVM +IR, `:ptx` for PTX assembly and `:cuda` for CUDA driver objects. -For a list of supported keyword arguments, refer to the documentation of -[`cufunction`](@ref). -""" -compile(cap::VersionNumber, @nospecialize(f::Core.Function), @nospecialize(tt); - kernel=true, kwargs...) 
= - compile(CompilerContext(f, tt, cap, kernel; kwargs...)) - -function compile(ctx::CompilerContext) - CUDAnative.configured || error("CUDAnative.jl has not been configured; cannot JIT code.") - - # generate code - ir, entry = codegen(ctx) - check_invocation(ctx, entry) - check_ir(ctx, ir) - verify(ir) - asm = mcgen(ctx, ir, entry) - - # enable debug options based on Julia's debug setting - jit_options = Dict{CUDAdrv.CUjit_option,Any}() - if Base.JLOptions().debug_level == 1 - jit_options[CUDAdrv.GENERATE_LINE_INFO] = true - elseif Base.JLOptions().debug_level >= 2 - jit_options[CUDAdrv.GENERATE_DEBUG_INFO] = true - end +The following keyword arguments are supported: +- `kernel`: enable kernel-specific code generation +- `optimize`: optimize the code +- `strip`: strip non-functional metadata and debug information - # link the CUDA device library - linker = CUDAdrv.CuLink(jit_options) - CUDAdrv.add_file!(linker, libcudadevrt, CUDAdrv.LIBRARY) - CUDAdrv.add_data!(linker, LLVM.name(entry), asm) - image = CUDAdrv.complete(linker) - - cuda_mod = CuModule(image, jit_options) - cuda_fun = CuFunction(cuda_mod, LLVM.name(entry)) - - return cuda_fun, cuda_mod -end - -codegen(cap::VersionNumber, @nospecialize(f::Core.Function), @nospecialize(tt); - kernel=true, kwargs...) = - codegen(CompilerContext(f, tt, cap, kernel; kwargs...)) - -function codegen(ctx::CompilerContext) - if compile_hook[] != nothing - hook = compile_hook[] - compile_hook[] = nothing - - global globalUnique - previous_globalUnique = globalUnique - - hook(ctx) - - globalUnique = previous_globalUnique - compile_hook[] = hook +Other keyword arguments can be found in the documentation of [`cufunction`](@ref). +""" +compile(to::Symbol, cap::VersionNumber, @nospecialize(f::Core.Function), @nospecialize(tt); + kernel::Bool=true, optimize::Bool=true, strip::Bool=false, hooked::Bool=false, + kwargs...) = + compile(to, CompilerContext(f, tt, cap, kernel; kwargs...); + optimize=optimize, strip=strip, hooked=hooked) + +function compile(to::Symbol, ctx::CompilerContext; + optimize::Bool=true, strip::Bool=false, hooked::Bool=false) + if !hooked + @debug "(Re)compiling function" ctx + if compile_hook[] != nothing + hook = compile_hook[] + compile_hook[] = nothing + + global globalUnique + previous_globalUnique = globalUnique + + hook(ctx) + + globalUnique = previous_globalUnique + compile_hook[] = hook + end end - ## high-level code generation (Julia AST) - - @debug "(Re)compiling function" ctx + ## Julia IR check_method(ctx) + # TODO: get the method here, don't put it in the context? 
+ #to == :julia && return asm - ## low-level code generation (LLVM IR) + + ## LLVM IR ir, entry = irgen(ctx) @@ -88,14 +67,20 @@ function codegen(ctx::CompilerContext) end # optimize the IR - entry = optimize!(ctx, ir, entry) + if optimize + entry = optimize!(ctx, ir, entry) + end runtime = load_runtime(ctx.cap) if need_library(runtime) link_library!(ctx, ir, runtime) end - prepare_execution!(ctx, ir) + verify(ir) + + if strip + strip_debuginfo!(ir) + end ## dynamic parallelism @@ -126,15 +111,50 @@ function codegen(ctx::CompilerContext) # compile and link for (call, dyn_f, dyn_tt) in dyn_calls dyn_ctx = CompilerContext(dyn_f, dyn_tt, ctx.cap, true) - dyn_ir, dyn_entry = codegen(dyn_ctx) - link_library!(ctx, ir, dyn_ir) + dyn_ir, dyn_entry = + compile(:llvm, dyn_ctx; optimize=optimize, strip=strip, hooked=hooked) + link!(ir, dyn_ir) # TODO unsafe_delete!(LLVM.parent(call), call) end + to == :llvm && return ir, entry + + + ## PTX machine code + + prepare_execution!(ctx, ir) + + check_invocation(ctx, entry) + check_ir(ctx, ir) + + asm = mcgen(ctx, ir, entry) + + to == :ptx && return asm, LLVM.name(entry) + + + ## CUDA objects + + # enable debug options based on Julia's debug setting + jit_options = Dict{CUDAdrv.CUjit_option,Any}() + if Base.JLOptions().debug_level == 1 + jit_options[CUDAdrv.GENERATE_LINE_INFO] = true + elseif Base.JLOptions().debug_level >= 2 + jit_options[CUDAdrv.GENERATE_DEBUG_INFO] = true + end + + # link the CUDA device library + linker = CUDAdrv.CuLink(jit_options) + CUDAdrv.add_file!(linker, libcudadevrt, CUDAdrv.LIBRARY) + CUDAdrv.add_data!(linker, LLVM.name(entry), asm) + image = CUDAdrv.complete(linker) + + cuda_mod = CuModule(image, jit_options) + cuda_fun = CuFunction(cuda_mod, LLVM.name(entry)) + + to == :cuda && return cuda_fun, cuda_mod - ## finalization - return ir, entry + error("Unknown compilation target $to") end diff --git a/src/execution.jl b/src/execution.jl index 3609e594..0f96e355 100644 --- a/src/execution.jl +++ b/src/execution.jl @@ -314,7 +314,7 @@ when function changes, or when different types or keyword arguments are provided if !haskey(compilecache, key) dev = device(ctx) cap = supported_capability(dev) - fun, mod = compile(cap, f, tt; kwargs...) + fun, mod = compile(:cuda, cap, f, tt; kwargs...) 
kernel = Kernel{f,tt}(ctx, mod, fun) @debug begin ver = version(kernel) diff --git a/src/reflection.jl b/src/reflection.jl index fc5bb388..5aa543b7 100644 --- a/src/reflection.jl +++ b/src/reflection.jl @@ -37,16 +37,9 @@ function code_llvm(io::IO, @nospecialize(func::Core.Function), @nospecialize(typ end function code_llvm(io::IO, ctx::CompilerContext; optimize::Bool=true, dump_module::Bool=false, strip_ir_metadata::Bool=true) - check_method(ctx) - mod, entry = irgen(ctx) - if optimize - entry = optimize!(ctx, mod, entry) - end - if strip_ir_metadata - strip_debuginfo!(mod) - end + ir, entry = compile(:llvm, ctx; optimize=optimize, strip=strip_ir_metadata) if dump_module - show(io, mod) + show(io, ir) else show(io, entry) end @@ -74,15 +67,8 @@ function code_ptx(io::IO, @nospecialize(func::Core.Function), @nospecialize(type code_ptx(io, ctx; strip_ir_metadata=strip_ir_metadata) end function code_ptx(io::IO, ctx::CompilerContext; strip_ir_metadata::Bool=true) - check_method(ctx) - mod, entry = irgen(ctx) - entry = optimize!(ctx, mod, entry) - if strip_ir_metadata - strip_debuginfo!(mod) - end - prepare_execution!(ctx, mod) - ptx = mcgen(ctx, mod, entry) - print(io, ptx) + asm, _ = compile(:ptx, ctx; strip=strip_ir_metadata) + print(io, asm) end code_ptx(@nospecialize(func), @nospecialize(types); kwargs...) = code_ptx(stdout, func, types; kwargs...) @@ -112,8 +98,7 @@ function code_sass(io::IO, ctx::CompilerContext) error("Your CUDA installation does not provide ptxas or nvdisasm, both of which are required for code_sass") end - ir, entry = codegen(ctx) - ptx = mcgen(ctx, ir, entry) + ptx, _ = compile(:ptx, ctx) fn = tempname() gpu = "sm_$(ctx.cap.major)$(ctx.cap.minor)" diff --git a/test/codegen.jl b/test/codegen.jl index 89d44e71..23874ecf 100644 --- a/test/codegen.jl +++ b/test/codegen.jl @@ -461,7 +461,7 @@ end @testset "non-isbits arguments" begin foobar(i) = (sink(unsafe_trunc(Int,i)); return) - @test_throws_message(CUDAnative.KernelError, CUDAnative.compile(v"3.5", foobar, Tuple{BigInt})) do msg + @test_throws_message(CUDAnative.KernelError, CUDAnative.compile(:ptx, v"3.5", foobar, Tuple{BigInt})) do msg occursin("passing and using non-bitstype argument", msg) && occursin("BigInt", msg) end @@ -470,7 +470,7 @@ end @testset "invalid LLVM IR" begin foobar(i) = println(i) - @test_throws_message(CUDAnative.InvalidIRError, CUDAnative.compile(v"3.5", foobar, Tuple{Int})) do msg + @test_throws_message(CUDAnative.InvalidIRError, CUDAnative.compile(:ptx, v"3.5", foobar, Tuple{Int})) do msg occursin("invalid LLVM IR", msg) && occursin(CUDAnative.RUNTIME_FUNCTION, msg) && occursin("[1] println", msg) && @@ -481,7 +481,7 @@ end @testset "invalid LLVM IR (ccall)" begin foobar(p) = (unsafe_store!(p, ccall(:time, Cint, ())); nothing) - @test_throws_message(CUDAnative.InvalidIRError, CUDAnative.compile(v"3.5", foobar, Tuple{Ptr{Int}})) do msg + @test_throws_message(CUDAnative.InvalidIRError, CUDAnative.compile(:ptx, v"3.5", foobar, Tuple{Ptr{Int}})) do msg occursin("invalid LLVM IR", msg) && occursin(CUDAnative.POINTER_FUNCTION, msg) && occursin(r"\[1\] .+foobar", msg) From d787f70d757a042da36e9171895dcd5e165bb427 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Tue, 19 Mar 2019 22:16:17 +0100 Subject: [PATCH 06/34] Don't double-report dynamic kernel compilations. 
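Reflection installs `compile_hook[]` to trace compilations; without this change, compiling a kernel that performs a device-side launch would fire that hook again for the recursively-compiled child kernel. A rough illustration (hypothetical kernels, using the internal hook and compile entry points):

    child() = nothing
    parent() = (@cuda dynamic=true child(); nothing)

    CUDAnative.compile_hook[] = ctx -> println("compiling ", ctx.f)
    CUDAnative.compile(:ptx, v"3.5", parent, Tuple{})
    # expected to report `parent` only; the recursive compilation of `child`
    # is linked in without a second report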
--- src/compiler/driver.jl | 65 ++++++++++++++++++++++++------------------ 1 file changed, 37 insertions(+), 28 deletions(-) diff --git a/src/compiler/driver.jl b/src/compiler/driver.jl index ef410aad..344f15cb 100644 --- a/src/compiler/driver.jl +++ b/src/compiler/driver.jl @@ -5,8 +5,7 @@ const compile_hook = Ref{Union{Nothing,Function}}(nothing) """ compile(to::Symbol, cap::VersionNumber, f, tt; - kernel=true, optimize=true, strip=false, hooked=false, - kwargs...) + kernel=true, optimize=true, strip=false, ...) Compile a function `f` invoked with types `tt` for device capability `cap` to one of the following formats as specified by the `to` argument: `:julia` for Julia IR, `:llvm` for LLVM @@ -20,27 +19,25 @@ The following keyword arguments are supported: Other keyword arguments can be found in the documentation of [`cufunction`](@ref). """ compile(to::Symbol, cap::VersionNumber, @nospecialize(f::Core.Function), @nospecialize(tt); - kernel::Bool=true, optimize::Bool=true, strip::Bool=false, hooked::Bool=false, - kwargs...) = + kernel::Bool=true, optimize::Bool=true, strip::Bool=false, kwargs...) = compile(to, CompilerContext(f, tt, cap, kernel; kwargs...); - optimize=optimize, strip=strip, hooked=hooked) + optimize=optimize, strip=strip) function compile(to::Symbol, ctx::CompilerContext; - optimize::Bool=true, strip::Bool=false, hooked::Bool=false) - if !hooked - @debug "(Re)compiling function" ctx - if compile_hook[] != nothing - hook = compile_hook[] - compile_hook[] = nothing + optimize::Bool=true, strip::Bool=false) + @debug "(Re)compiling function" ctx - global globalUnique - previous_globalUnique = globalUnique + if compile_hook[] != nothing + hook = compile_hook[] + compile_hook[] = nothing - hook(ctx) + global globalUnique + previous_globalUnique = globalUnique - globalUnique = previous_globalUnique - compile_hook[] = hook - end + hook(ctx) + + globalUnique = previous_globalUnique + compile_hook[] = hook end @@ -85,9 +82,10 @@ function compile(to::Symbol, ctx::CompilerContext; ## dynamic parallelism - # find dynamic kernel invocations - dyn_calls = [] if haskey(functions(ir), "cudanativeLaunchDevice") + dyn_calls = [] + + # find dynamic kernel invocations f = functions(ir)["cudanativeLaunchDevice"] for use in uses(f) # decode the call @@ -106,17 +104,28 @@ function compile(to::Symbol, ctx::CompilerContext; dyn_f, dyn_tt = unsafe_pointer_to_objref.(ops) push!(dyn_calls, (call, dyn_f, dyn_tt)) end - end - # compile and link - for (call, dyn_f, dyn_tt) in dyn_calls - dyn_ctx = CompilerContext(dyn_f, dyn_tt, ctx.cap, true) - dyn_ir, dyn_entry = - compile(:llvm, dyn_ctx; optimize=optimize, strip=strip, hooked=hooked) - link!(ir, dyn_ir) + # compile and link + for (call, dyn_f, dyn_tt) in dyn_calls + # disable the compile hook; this recursive compilation call + # shouldn't be traced separately + hook = compile_hook[] + compile_hook[] = nothing + + dyn_ctx = CompilerContext(dyn_f, dyn_tt, ctx.cap, true) + dyn_ir, dyn_entry = + compile(:llvm, dyn_ctx; optimize=optimize, strip=strip) + + compile_hook[] = hook + + link!(ir, dyn_ir) + + # TODO + unsafe_delete!(LLVM.parent(call), call) + end - # TODO - unsafe_delete!(LLVM.parent(call), call) + @compiler_assert isempty(uses(f)) ctx + unsafe_delete!(ir, f) end to == :llvm && return ir, entry From f20580234e67f3ed59ae75180700a1f0b62e7ff6 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Wed, 20 Mar 2019 09:05:20 +0100 Subject: [PATCH 07/34] Improve hook handline, and reuse compiler driver for rtlib generation. 
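The save/restore dance around `compile_hook[]` is replaced by a `hooks` keyword argument, which reflection and the runtime-library generator simply pass as `false`. A sketch of the resulting entry point (the device function is hypothetical):

    add(x, y) = x + y

    # compile to LLVM IR as a plain device function without firing the
    # reflection hook; this is what emit_function! and code_llvm now do
    ir, entry = CUDAnative.compile(:llvm, v"3.5", add, Tuple{Int,Int}, #=kernel=# false;
                                   hooks=false)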
--- src/compiler/driver.jl | 37 ++++++++++++++----------------------- src/compiler/rtlib.jl | 5 +---- src/reflection.jl | 6 +++--- 3 files changed, 18 insertions(+), 30 deletions(-) diff --git a/src/compiler/driver.jl b/src/compiler/driver.jl index 344f15cb..1fa8bab6 100644 --- a/src/compiler/driver.jl +++ b/src/compiler/driver.jl @@ -4,40 +4,38 @@ const compile_hook = Ref{Union{Nothing,Function}}(nothing) """ - compile(to::Symbol, cap::VersionNumber, f, tt; + compile(to::Symbol, cap::VersionNumber, f, tt, kernel=true; kernel=true, optimize=true, strip=false, ...) Compile a function `f` invoked with types `tt` for device capability `cap` to one of the following formats as specified by the `to` argument: `:julia` for Julia IR, `:llvm` for LLVM -IR, `:ptx` for PTX assembly and `:cuda` for CUDA driver objects. +IR, `:ptx` for PTX assembly and `:cuda` for CUDA driver objects. If the `kernel` flag is +set, specialized code generation and optimization for kernel functions is enabled. The following keyword arguments are supported: -- `kernel`: enable kernel-specific code generation -- `optimize`: optimize the code -- `strip`: strip non-functional metadata and debug information +- `hooks`: enable compiler hooks that drive reflection functions (default: true) +- `optimize`: optimize the code (default: true) +- `strip`: strip non-functional metadata and debug information (default: false) Other keyword arguments can be found in the documentation of [`cufunction`](@ref). """ -compile(to::Symbol, cap::VersionNumber, @nospecialize(f::Core.Function), @nospecialize(tt); - kernel::Bool=true, optimize::Bool=true, strip::Bool=false, kwargs...) = +compile(to::Symbol, cap::VersionNumber, @nospecialize(f::Core.Function), @nospecialize(tt), + kernel::Bool=true; hooks::Bool=true, optimize::Bool=true, strip::Bool=false, + kwargs...) 
= compile(to, CompilerContext(f, tt, cap, kernel; kwargs...); - optimize=optimize, strip=strip) + hooks=hooks, optimize=optimize, strip=strip) function compile(to::Symbol, ctx::CompilerContext; - optimize::Bool=true, strip::Bool=false) + hooks::Bool=true, optimize::Bool=true, strip::Bool=false) @debug "(Re)compiling function" ctx - if compile_hook[] != nothing - hook = compile_hook[] - compile_hook[] = nothing - + if hooks && compile_hook[] != nothing global globalUnique previous_globalUnique = globalUnique - hook(ctx) + compile_hook[](ctx) globalUnique = previous_globalUnique - compile_hook[] = hook end @@ -107,16 +105,9 @@ function compile(to::Symbol, ctx::CompilerContext; # compile and link for (call, dyn_f, dyn_tt) in dyn_calls - # disable the compile hook; this recursive compilation call - # shouldn't be traced separately - hook = compile_hook[] - compile_hook[] = nothing - dyn_ctx = CompilerContext(dyn_f, dyn_tt, ctx.cap, true) dyn_ir, dyn_entry = - compile(:llvm, dyn_ctx; optimize=optimize, strip=strip) - - compile_hook[] = hook + compile(:llvm, dyn_ctx; hooks=false, optimize=optimize, strip=strip) link!(ir, dyn_ir) diff --git a/src/compiler/rtlib.jl b/src/compiler/rtlib.jl index 81abb66d..58b6c9a5 100644 --- a/src/compiler/rtlib.jl +++ b/src/compiler/rtlib.jl @@ -124,11 +124,8 @@ end function emit_function!(mod, cap, f, types, name) tt = Base.to_tuple_type(types) - ctx = CompilerContext(f, tt, cap, #= kernel =# false) - new_mod, entry = irgen(ctx) - entry = optimize!(ctx, new_mod, entry) + new_mod, entry = compile(:llvm, cap, f, tt, #=kernel=# false; hooks=false) LLVM.name!(entry, name) - link!(mod, new_mod) end diff --git a/src/reflection.jl b/src/reflection.jl index 5aa543b7..1b23388f 100644 --- a/src/reflection.jl +++ b/src/reflection.jl @@ -37,7 +37,7 @@ function code_llvm(io::IO, @nospecialize(func::Core.Function), @nospecialize(typ end function code_llvm(io::IO, ctx::CompilerContext; optimize::Bool=true, dump_module::Bool=false, strip_ir_metadata::Bool=true) - ir, entry = compile(:llvm, ctx; optimize=optimize, strip=strip_ir_metadata) + ir, entry = compile(:llvm, ctx; hooks=false, optimize=optimize, strip=strip_ir_metadata) if dump_module show(io, ir) else @@ -67,7 +67,7 @@ function code_ptx(io::IO, @nospecialize(func::Core.Function), @nospecialize(type code_ptx(io, ctx; strip_ir_metadata=strip_ir_metadata) end function code_ptx(io::IO, ctx::CompilerContext; strip_ir_metadata::Bool=true) - asm, _ = compile(:ptx, ctx; strip=strip_ir_metadata) + asm, _ = compile(:ptx, ctx; hooks=false, strip=strip_ir_metadata) print(io, asm) end code_ptx(@nospecialize(func), @nospecialize(types); kwargs...) = @@ -98,7 +98,7 @@ function code_sass(io::IO, ctx::CompilerContext) error("Your CUDA installation does not provide ptxas or nvdisasm, both of which are required for code_sass") end - ptx, _ = compile(:ptx, ctx) + ptx, _ = compile(:ptx, ctx; hooks=false) fn = tempname() gpu = "sm_$(ctx.cap.major)$(ctx.cap.minor)" From 6ab86a2faed3b4f0eb07901aeae40e1b7747cfef Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Wed, 20 Mar 2019 09:15:57 +0100 Subject: [PATCH 08/34] Move Julia irgen to compiler driver. 
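With the method-instance lookup moved into the driver, the `:julia` target becomes meaningful as a first stopping point. A sketch (hypothetical function):

    add(x, y) = x + y
    linfo = CUDAnative.compile(:julia, v"3.5", add, Tuple{Int,Int}, #=kernel=# false)
    # linfo isa Core.MethodInstance: the specialization that irgen compiles next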
--- src/compiler/driver.jl | 19 ++++++++++++++++--- src/compiler/irgen.jl | 19 ++----------------- src/compiler/validation.jl | 2 ++ 3 files changed, 20 insertions(+), 20 deletions(-) diff --git a/src/compiler/driver.jl b/src/compiler/driver.jl index 1fa8bab6..75a2514e 100644 --- a/src/compiler/driver.jl +++ b/src/compiler/driver.jl @@ -43,13 +43,26 @@ function compile(to::Symbol, ctx::CompilerContext; check_method(ctx) - # TODO: get the method here, don't put it in the context? - #to == :julia && return asm + # get the method instance + world = typemax(UInt) + meth = which(ctx.f, ctx.tt) + sig = Base.signature_type(ctx.f, ctx.tt)::Type + (ti, env) = ccall(:jl_type_intersection_with_env, Any, + (Any, Any), sig, meth.sig)::Core.SimpleVector + if VERSION >= v"1.2.0-DEV.320" + meth = Base.func_for_method_checked(meth, ti, env) + else + meth = Base.func_for_method_checked(meth, ti) + end + linfo = ccall(:jl_specializations_get_linfo, Ref{Core.MethodInstance}, + (Any, Any, Any, UInt), meth, ti, env, world) + + to == :julia && return linfo ## LLVM IR - ir, entry = irgen(ctx) + ir, entry = irgen(ctx, linfo) need_library(lib) = any(f -> isdeclaration(f) && intrinsic_id(f) == 0 && diff --git a/src/compiler/irgen.jl b/src/compiler/irgen.jl index 8cefe6d9..cea12d3f 100644 --- a/src/compiler/irgen.jl +++ b/src/compiler/irgen.jl @@ -1,4 +1,4 @@ -# Julia/LLVM IR generation and transformation passes +# LLVM IR generation function module_setup(mod::LLVM.Module) triple!(mod, Int === Int64 ? "nvptx64-nvidia-cuda" : "nvptx-nvidia-cuda") @@ -97,22 +97,7 @@ function compile_linfo(ctx::CompilerContext, linfo::Core.MethodInstance, world) return llvmf, modules end -function irgen(ctx::CompilerContext) - # get the method instance - isa(ctx.f, Core.Builtin) && throw(KernelError(ctx, "function is not a generic function")) - world = typemax(UInt) - meth = which(ctx.f, ctx.tt) - sig = Base.signature_type(ctx.f, ctx.tt)::Type - (ti, env) = ccall(:jl_type_intersection_with_env, Any, - (Any, Any), sig, meth.sig)::Core.SimpleVector - if VERSION >= v"1.2.0-DEV.320" - meth = Base.func_for_method_checked(meth, ti, env) - else - meth = Base.func_for_method_checked(meth, ti) - end - linfo = ccall(:jl_specializations_get_linfo, Ref{Core.MethodInstance}, - (Any, Any, Any, UInt), meth, ti, env, world) - +function irgen(ctx::CompilerContext, linfo, world=typemax(UInt)) entry, modules = compile_linfo(ctx, linfo, world) # link in dependent modules diff --git a/src/compiler/validation.jl b/src/compiler/validation.jl index c69cacba..5fedbd79 100644 --- a/src/compiler/validation.jl +++ b/src/compiler/validation.jl @@ -1,6 +1,8 @@ # validation of properties and code function check_method(ctx::CompilerContext) + isa(ctx.f, Core.Builtin) && throw(KernelError(ctx, "function is not a generic function")) + # get the method ms = Base.methods(ctx.f, ctx.tt) isempty(ms) && throw(KernelError(ctx, "no method found")) From 2126b7400eb970e91a546de40212b1d93a786ca7 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Wed, 20 Mar 2019 09:17:58 +0100 Subject: [PATCH 09/34] NFC changes. 
--- src/compiler/driver.jl | 12 ++++++------ src/compiler/irgen.jl | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/compiler/driver.jl b/src/compiler/driver.jl index 75a2514e..b38b14a8 100644 --- a/src/compiler/driver.jl +++ b/src/compiler/driver.jl @@ -62,7 +62,7 @@ function compile(to::Symbol, ctx::CompilerContext; ## LLVM IR - ir, entry = irgen(ctx, linfo) + ir, entry = irgen(ctx, linfo, world) need_library(lib) = any(f -> isdeclaration(f) && intrinsic_id(f) == 0 && @@ -94,13 +94,13 @@ function compile(to::Symbol, ctx::CompilerContext; ## dynamic parallelism if haskey(functions(ir), "cudanativeLaunchDevice") - dyn_calls = [] + f = functions(ir)["cudanativeLaunchDevice"] # find dynamic kernel invocations - f = functions(ir)["cudanativeLaunchDevice"] + # TODO: recover this information earlier, from the Julia IR + worklist = [] for use in uses(f) # decode the call - # FIXME: recover this earlier, from the Julia IR call = user(use)::LLVM.CallInst ops = collect(operands(call))[1:2] ## addrspacecast @@ -113,11 +113,11 @@ function compile(to::Symbol, ctx::CompilerContext; ops = Ptr{Any}.(ops) dyn_f, dyn_tt = unsafe_pointer_to_objref.(ops) - push!(dyn_calls, (call, dyn_f, dyn_tt)) + push!(worklist, (call, dyn_f, dyn_tt)) end # compile and link - for (call, dyn_f, dyn_tt) in dyn_calls + for (call, dyn_f, dyn_tt) in worklist dyn_ctx = CompilerContext(dyn_f, dyn_tt, ctx.cap, true) dyn_ir, dyn_entry = compile(:llvm, dyn_ctx; hooks=false, optimize=optimize, strip=strip) diff --git a/src/compiler/irgen.jl b/src/compiler/irgen.jl index cea12d3f..e9677092 100644 --- a/src/compiler/irgen.jl +++ b/src/compiler/irgen.jl @@ -97,7 +97,7 @@ function compile_linfo(ctx::CompilerContext, linfo::Core.MethodInstance, world) return llvmf, modules end -function irgen(ctx::CompilerContext, linfo, world=typemax(UInt)) +function irgen(ctx::CompilerContext, linfo::Core.MethodInstance, world) entry, modules = compile_linfo(ctx, linfo, world) # link in dependent modules From f1dca4b92ee88a188d3b1209412862b3559306c8 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Wed, 20 Mar 2019 10:44:52 +0100 Subject: [PATCH 10/34] Initial use of libcudadevrt for dynamic parallelism. 
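This makes a zero-argument device-side launch go through libcudadevrt. A rough sketch of what becomes expressible (the kernels are hypothetical; arguments and launch configuration keywords are still unsupported at this point):

    child() = nothing

    function parent()
        # expands to dynamic_cufunction + dynamic_launch, which call
        # cudaGetParameterBufferV2 and cudaLaunchDeviceV2 on the device
        @cuda dynamic=true child()
        return
    end

    @cuda parent()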
--- src/compiler/driver.jl | 9 ++++++--- src/compiler/validation.jl | 10 ++++++++-- src/execution.jl | 31 +++++++++++++++++++++++++------ 3 files changed, 39 insertions(+), 11 deletions(-) diff --git a/src/compiler/driver.jl b/src/compiler/driver.jl index b38b14a8..8ef39bd9 100644 --- a/src/compiler/driver.jl +++ b/src/compiler/driver.jl @@ -93,8 +93,8 @@ function compile(to::Symbol, ctx::CompilerContext; ## dynamic parallelism - if haskey(functions(ir), "cudanativeLaunchDevice") - f = functions(ir)["cudanativeLaunchDevice"] + if haskey(functions(ir), "cudanativeCompileKernel") + f = functions(ir)["cudanativeCompileKernel"] # find dynamic kernel invocations # TODO: recover this information earlier, from the Julia IR @@ -122,9 +122,12 @@ function compile(to::Symbol, ctx::CompilerContext; dyn_ir, dyn_entry = compile(:llvm, dyn_ctx; hooks=false, optimize=optimize, strip=strip) + dyn_fn = LLVM.name(dyn_entry) link!(ir, dyn_ir) + dyn_ir = nothing + dyn_entry = functions(ir)[dyn_fn] - # TODO + replace_uses!(call, dyn_entry) unsafe_delete!(LLVM.parent(call), call) end diff --git a/src/compiler/validation.jl b/src/compiler/validation.jl index 5fedbd79..1c225f44 100644 --- a/src/compiler/validation.jl +++ b/src/compiler/validation.jl @@ -100,8 +100,14 @@ function check_ir!(ctx, errors::Vector{IRError}, f::LLVM.Function) return errors end -const special_fns = ("vprintf", "__assertfail", "malloc", "free", "__nvvm_reflect", - #=device_rt=# "cudaDeviceSynchronize") +const special_fns = ( + # PTX intrinsics + "vprintf", "__assertfail", "malloc", "free", + # libdevice + "__nvvm_reflect", + # libcudevrt + "cudaDeviceSynchronize", "cudaGetParameterBufferV2", "cudaLaunchDeviceV2" +) const libjulia = Ref{Ptr{Cvoid}}(C_NULL) diff --git a/src/execution.jl b/src/execution.jl index 0f96e355..f9ebc4ac 100644 --- a/src/execution.jl +++ b/src/execution.jl @@ -61,7 +61,8 @@ end ## helper functions -# split keyword arguments to `@cuda` into ones affecting the compiler, or the execution +# split keyword arguments to `@cuda` into ones affecting the macro itself, the compiler and +# the code it generates, or the execution function split_kwargs(kwargs) macro_kws = [:dynamic] compiler_kws = [:minthreads, :maxthreads, :blocks_per_sm, :maxregs] @@ -220,11 +221,12 @@ macro cuda(ex...) # WIP # TODO: GC.@preserve? # TODO: error on, or support kwargs - kernel_args = var_exprs # already in kernel land, so don't need a conversion push!(code.args, quote + # we're in kernel land already, so no need to convert arguments local kernel_tt = Tuple{$((:(Core.Typeof($var)) for var in var_exprs)...)} - dynamic_cufunction($(esc(f)), kernel_tt) + local kernel = dynamic_cufunction($(esc(f)), kernel_tt) + dynamic_launch(kernel, 1, 1, 0, C_NULL) end) else # regular, host-side kernel launch @@ -236,7 +238,8 @@ macro cuda(ex...) GC.@preserve $(vars...) begin local kernel_args = cudaconvert.(($(var_exprs...),)) local kernel_tt = Tuple{Core.Typeof.(kernel_args)...} - local kernel = cufunction($(esc(f)), kernel_tt; $(map(esc, compiler_kwargs)...)) + local kernel = cufunction($(esc(f)), kernel_tt; + $(map(esc, compiler_kwargs)...)) kernel(kernel_args...; $(map(esc, call_kwargs)...)) end end) @@ -245,6 +248,23 @@ macro cuda(ex...) 
return code end +import CUDAdrv: CuDim3 + +const cudaError_t = Cint +const cudaStream_t = Ptr{Cvoid} + +@inline function dynamic_launch(f::Ptr{Cvoid}, blocks::CuDim, threads::CuDim, shmem::Int, stream::Ptr{Cvoid}) + blocks = CuDim3(blocks) + threads = CuDim3(threads) + buf = ccall("extern cudaGetParameterBufferV2", llvmcall, Ptr{Cvoid}, + (Ptr{Cvoid}, CuDim3, CuDim3, Cuint), + f, blocks, threads, shmem) + + ccall("extern cudaLaunchDeviceV2", llvmcall, cudaError_t, + (Ptr{Cvoid}, cudaStream_t), + buf, stream) +end + @generated function dynamic_cufunction(f::Core.Function, tt::Type=Tuple{}) tt = Base.to_tuple_type(tt.parameters[1]) sig = Base.signature_type(f, tt) @@ -253,8 +273,7 @@ end quote # drop the f and tt into the module, and recover them later during compilation - ccall("extern cudanativeLaunchDevice", llvmcall, Nothing, (Any, Any), f, tt) - nothing + ccall("extern cudanativeCompileKernel", llvmcall, Ptr{Cvoid}, (Any, Any), f, tt) end end From 6646a4b559281ba5358d099cafdd4d87fb83f304 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Wed, 20 Mar 2019 14:07:11 +0100 Subject: [PATCH 11/34] Use simpler dynamic cufunction construction. --- src/execution.jl | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/src/execution.jl b/src/execution.jl index f9ebc4ac..f83cd6b9 100644 --- a/src/execution.jl +++ b/src/execution.jl @@ -253,6 +253,9 @@ import CUDAdrv: CuDim3 const cudaError_t = Cint const cudaStream_t = Ptr{Cvoid} +dynamic_cufunction(f::Core.Function, tt::Type=Tuple{}) = + ccall("extern cudanativeCompileKernel", llvmcall, Ptr{Cvoid}, (Any, Any), f, tt) + @inline function dynamic_launch(f::Ptr{Cvoid}, blocks::CuDim, threads::CuDim, shmem::Int, stream::Ptr{Cvoid}) blocks = CuDim3(blocks) threads = CuDim3(threads) @@ -265,18 +268,6 @@ const cudaStream_t = Ptr{Cvoid} buf, stream) end -@generated function dynamic_cufunction(f::Core.Function, tt::Type=Tuple{}) - tt = Base.to_tuple_type(tt.parameters[1]) - sig = Base.signature_type(f, tt) - t = Tuple(tt.parameters) - # TODO: closures - - quote - # drop the f and tt into the module, and recover them later during compilation - ccall("extern cudanativeCompileKernel", llvmcall, Ptr{Cvoid}, (Any, Any), f, tt) - end -end - ## APIs for manual compilation From 3bb7eeaedcb25919794dedee38fa5fd2b89a15f2 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Wed, 20 Mar 2019 14:07:59 +0100 Subject: [PATCH 12/34] Support call kwargs. --- src/execution.jl | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/execution.jl b/src/execution.jl index f83cd6b9..118eb0c6 100644 --- a/src/execution.jl +++ b/src/execution.jl @@ -226,7 +226,7 @@ macro cuda(ex...) # we're in kernel land already, so no need to convert arguments local kernel_tt = Tuple{$((:(Core.Typeof($var)) for var in var_exprs)...)} local kernel = dynamic_cufunction($(esc(f)), kernel_tt) - dynamic_launch(kernel, 1, 1, 0, C_NULL) + dynamic_launch(kernel, $(var_exprs...); $(map(esc, call_kwargs)...)) end) else # regular, host-side kernel launch @@ -248,21 +248,25 @@ macro cuda(ex...) 
return code end -import CUDAdrv: CuDim3 +import CUDAdrv: CuDim3, CuStream_t const cudaError_t = Cint -const cudaStream_t = Ptr{Cvoid} +const cudaStream_t = CUDAdrv.CuStream_t dynamic_cufunction(f::Core.Function, tt::Type=Tuple{}) = ccall("extern cudanativeCompileKernel", llvmcall, Ptr{Cvoid}, (Any, Any), f, tt) -@inline function dynamic_launch(f::Ptr{Cvoid}, blocks::CuDim, threads::CuDim, shmem::Int, stream::Ptr{Cvoid}) +@inline function dynamic_launch(f::Ptr{Cvoid}, values::Vararg{Any,N}; + blocks::CuDim=1, threads::CuDim=1, shmem::Int=0, + stream::CuStream=CuDefaultStream()) where N blocks = CuDim3(blocks) threads = CuDim3(threads) buf = ccall("extern cudaGetParameterBufferV2", llvmcall, Ptr{Cvoid}, (Ptr{Cvoid}, CuDim3, CuDim3, Cuint), f, blocks, threads, shmem) + # TODO: store into buf + ccall("extern cudaLaunchDeviceV2", llvmcall, cudaError_t, (Ptr{Cvoid}, cudaStream_t), buf, stream) From d94e96de87f42fd616fd340ea7c22bb62e1672cb Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Wed, 20 Mar 2019 15:29:41 +0100 Subject: [PATCH 13/34] Initial support for arguments. --- src/execution.jl | 72 ++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 64 insertions(+), 8 deletions(-) diff --git a/src/execution.jl b/src/execution.jl index 118eb0c6..8259d087 100644 --- a/src/execution.jl +++ b/src/execution.jl @@ -226,7 +226,7 @@ macro cuda(ex...) # we're in kernel land already, so no need to convert arguments local kernel_tt = Tuple{$((:(Core.Typeof($var)) for var in var_exprs)...)} local kernel = dynamic_cufunction($(esc(f)), kernel_tt) - dynamic_launch(kernel, $(var_exprs...); $(map(esc, call_kwargs)...)) + dynamic_cudacall(kernel, kernel_tt, $(var_exprs...); $(map(esc, call_kwargs)...)) end) else # regular, host-side kernel launch @@ -256,20 +256,76 @@ const cudaStream_t = CUDAdrv.CuStream_t dynamic_cufunction(f::Core.Function, tt::Type=Tuple{}) = ccall("extern cudanativeCompileKernel", llvmcall, Ptr{Cvoid}, (Any, Any), f, tt) -@inline function dynamic_launch(f::Ptr{Cvoid}, values::Vararg{Any,N}; - blocks::CuDim=1, threads::CuDim=1, shmem::Int=0, - stream::CuStream=CuDefaultStream()) where N +@generated function dynamic_cudacall(f::Ptr{Cvoid}, tt::Type, args...; + blocks::CuDim=1, threads::CuDim=1, shmem::Integer=0, + stream::CuStream=CuDefaultStream()) + ex = quote + Base.@_inline_meta + end + + # convert the argument values to match the kernel's signature (specified by the user) + # (this mimics `lower-ccall` in julia-syntax.scm) + converted_args = Vector{Symbol}(undef, length(args)) + arg_ptrs = Vector{Symbol}(undef, length(args)) + for i in 1:length(args) + converted_args[i] = gensym() + arg_ptrs[i] = gensym() + push!(ex.args, :($(converted_args[i]) = Base.cconvert($(args[i]), args[$i]))) + push!(ex.args, :($(arg_ptrs[i]) = Base.unsafe_convert($(args[i]), $(converted_args[i])))) + end + + append!(ex.args, (quote + #GC.@preserve $(converted_args...) begin + launch(f, blocks, threads, shmem, stream, ($(arg_ptrs...),)) + #end + end).args) + + return ex +end + +@inline function launch(f::Ptr{Cvoid}, blocks::CuDim, threads::CuDim, + shmem::Int, stream::CuStream, + args...) blocks = CuDim3(blocks) threads = CuDim3(threads) - buf = ccall("extern cudaGetParameterBufferV2", llvmcall, Ptr{Cvoid}, - (Ptr{Cvoid}, CuDim3, CuDim3, Cuint), - f, blocks, threads, shmem) - # TODO: store into buf + buf = parameter_buffer(f, blocks, threads, shmem, args...) 
ccall("extern cudaLaunchDeviceV2", llvmcall, cudaError_t, (Ptr{Cvoid}, cudaStream_t), buf, stream) + + return +end + +@generated function parameter_buffer(f::Ptr{Cvoid}, blocks::CuDim3, threads::CuDim3, + shmem::Int, args...) + # allocate a buffer + ex = quote + buf = ccall("extern cudaGetParameterBufferV2", llvmcall, Ptr{Cvoid}, + (Ptr{Cvoid}, CuDim3, CuDim3, Cuint), + f, blocks, threads, shmem) + end + + # store the parameters + # + # > Each individual parameter placed in the parameter buffer is required to be aligned. + # > That is, each parameter must be placed at the n-th byte in the parameter buffer, + # > where n is the smallest multiple of the parameter size that is greater than the + # > offset of the last byte taken by the preceding parameter. The maximum size of the + # > parameter buffer is 4KB. + offset = 0 + for i in 1:length(args) + buf_index = Base.ceil(Int, offset / sizeof(args[i])) + 1 + offset = buf_index * sizeof(args[i]) + push!(ex.args, :( + unsafe_store!(Base.unsafe_convert(Ptr{$(args[i])}, buf), args[$i], $buf_index) + )) + end + + push!(ex.args, :(return buf)) + + return ex end From cd295380d212c83d8d9f4f48e5cf3397c24963e1 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Wed, 20 Mar 2019 15:42:57 +0100 Subject: [PATCH 14/34] Move code around. --- src/device/cuda/libcudadevrt.jl | 51 ++++++++++++++ src/execution.jl | 117 +++++++++----------------------- 2 files changed, 84 insertions(+), 84 deletions(-) diff --git a/src/device/cuda/libcudadevrt.jl b/src/device/cuda/libcudadevrt.jl index 39faf3b2..59f7c969 100644 --- a/src/device/cuda/libcudadevrt.jl +++ b/src/device/cuda/libcudadevrt.jl @@ -4,6 +4,57 @@ # the CUDA API for execution on the device, such as device synchronization primitives, # dynamic kernel APIs, etc. +import CUDAdrv: CuDim3, CuStream_t + +const cudaError_t = Cint +const cudaStream_t = CUDAdrv.CuStream_t + +# device-side counterpart of CUDAdrv.launch +@inline function launch(f::Ptr{Cvoid}, blocks::CuDim, threads::CuDim, + shmem::Int, stream::CuStream, + args...) + blocks = CuDim3(blocks) + threads = CuDim3(threads) + + buf = parameter_buffer(f, blocks, threads, shmem, args...) + + ccall("extern cudaLaunchDeviceV2", llvmcall, cudaError_t, + (Ptr{Cvoid}, cudaStream_t), + buf, stream) + + return +end + +@generated function parameter_buffer(f::Ptr{Cvoid}, blocks::CuDim3, threads::CuDim3, + shmem::Int, args...) + # allocate a buffer + ex = quote + buf = ccall("extern cudaGetParameterBufferV2", llvmcall, Ptr{Cvoid}, + (Ptr{Cvoid}, CuDim3, CuDim3, Cuint), + f, blocks, threads, shmem) + end + + # store the parameters + # + # > Each individual parameter placed in the parameter buffer is required to be aligned. + # > That is, each parameter must be placed at the n-th byte in the parameter buffer, + # > where n is the smallest multiple of the parameter size that is greater than the + # > offset of the last byte taken by the preceding parameter. The maximum size of the + # > parameter buffer is 4KB. + offset = 0 + for i in 1:length(args) + buf_index = Base.ceil(Int, offset / sizeof(args[i])) + 1 + offset = buf_index * sizeof(args[i]) + push!(ex.args, :( + unsafe_store!(Base.unsafe_convert(Ptr{$(args[i])}, buf), args[$i], $buf_index) + )) + end + + push!(ex.args, :(return buf)) + + return ex +end + """ synchronize() diff --git a/src/execution.jl b/src/execution.jl index 8259d087..c16411bc 100644 --- a/src/execution.jl +++ b/src/execution.jl @@ -217,10 +217,6 @@ macro cuda(ex...) if dynamic # dynamic, device-side kernel launch - # - # WIP - # TODO: GC.@preserve? 
- # TODO: error on, or support kwargs push!(code.args, quote # we're in kernel land already, so no need to convert arguments @@ -248,86 +244,6 @@ macro cuda(ex...) return code end -import CUDAdrv: CuDim3, CuStream_t - -const cudaError_t = Cint -const cudaStream_t = CUDAdrv.CuStream_t - -dynamic_cufunction(f::Core.Function, tt::Type=Tuple{}) = - ccall("extern cudanativeCompileKernel", llvmcall, Ptr{Cvoid}, (Any, Any), f, tt) - -@generated function dynamic_cudacall(f::Ptr{Cvoid}, tt::Type, args...; - blocks::CuDim=1, threads::CuDim=1, shmem::Integer=0, - stream::CuStream=CuDefaultStream()) - ex = quote - Base.@_inline_meta - end - - # convert the argument values to match the kernel's signature (specified by the user) - # (this mimics `lower-ccall` in julia-syntax.scm) - converted_args = Vector{Symbol}(undef, length(args)) - arg_ptrs = Vector{Symbol}(undef, length(args)) - for i in 1:length(args) - converted_args[i] = gensym() - arg_ptrs[i] = gensym() - push!(ex.args, :($(converted_args[i]) = Base.cconvert($(args[i]), args[$i]))) - push!(ex.args, :($(arg_ptrs[i]) = Base.unsafe_convert($(args[i]), $(converted_args[i])))) - end - - append!(ex.args, (quote - #GC.@preserve $(converted_args...) begin - launch(f, blocks, threads, shmem, stream, ($(arg_ptrs...),)) - #end - end).args) - - return ex -end - -@inline function launch(f::Ptr{Cvoid}, blocks::CuDim, threads::CuDim, - shmem::Int, stream::CuStream, - args...) - blocks = CuDim3(blocks) - threads = CuDim3(threads) - - buf = parameter_buffer(f, blocks, threads, shmem, args...) - - ccall("extern cudaLaunchDeviceV2", llvmcall, cudaError_t, - (Ptr{Cvoid}, cudaStream_t), - buf, stream) - - return -end - -@generated function parameter_buffer(f::Ptr{Cvoid}, blocks::CuDim3, threads::CuDim3, - shmem::Int, args...) - # allocate a buffer - ex = quote - buf = ccall("extern cudaGetParameterBufferV2", llvmcall, Ptr{Cvoid}, - (Ptr{Cvoid}, CuDim3, CuDim3, Cuint), - f, blocks, threads, shmem) - end - - # store the parameters - # - # > Each individual parameter placed in the parameter buffer is required to be aligned. - # > That is, each parameter must be placed at the n-th byte in the parameter buffer, - # > where n is the smallest multiple of the parameter size that is greater than the - # > offset of the last byte taken by the preceding parameter. The maximum size of the - # > parameter buffer is 4KB. 
- offset = 0 - for i in 1:length(args) - buf_index = Base.ceil(Int, offset / sizeof(args[i])) + 1 - offset = buf_index * sizeof(args[i]) - push!(ex.args, :( - unsafe_store!(Base.unsafe_convert(Ptr{$(args[i])}, buf), args[$i], $buf_index) - )) - end - - push!(ex.args, :(return buf)) - - return ex -end - ## APIs for manual compilation @@ -445,6 +361,39 @@ The following keyword arguments are supported: Kernel +## dynamic parallelism + +dynamic_cufunction(f::Core.Function, tt::Type=Tuple{}) = + ccall("extern cudanativeCompileKernel", llvmcall, Ptr{Cvoid}, (Any, Any), f, tt) + +@generated function dynamic_cudacall(f::Ptr{Cvoid}, tt::Type, args...; + blocks::CuDim=1, threads::CuDim=1, shmem::Integer=0, + stream::CuStream=CuDefaultStream()) + ex = quote + Base.@_inline_meta + end + + # convert the argument values to match the kernel's signature (specified by the user) + # (this mimics `lower-ccall` in julia-syntax.scm) + converted_args = Vector{Symbol}(undef, length(args)) + arg_ptrs = Vector{Symbol}(undef, length(args)) + for i in 1:length(args) + converted_args[i] = gensym() + arg_ptrs[i] = gensym() + push!(ex.args, :($(converted_args[i]) = Base.cconvert($(args[i]), args[$i]))) + push!(ex.args, :($(arg_ptrs[i]) = Base.unsafe_convert($(args[i]), $(converted_args[i])))) + end + + append!(ex.args, (quote + #GC.@preserve $(converted_args...) begin + launch(f, blocks, threads, shmem, stream, ($(arg_ptrs...),)) + #end + end).args) + + return ex +end + + ## other """ From e9de1bb2a37b8b308f3d5b77877d901cf496d827 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Wed, 20 Mar 2019 15:56:06 +0100 Subject: [PATCH 15/34] Introduce a DynamicKernel object. --- src/execution.jl | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/src/execution.jl b/src/execution.jl index c16411bc..5feafa5d 100644 --- a/src/execution.jl +++ b/src/execution.jl @@ -219,10 +219,10 @@ macro cuda(ex...) # dynamic, device-side kernel launch push!(code.args, quote - # we're in kernel land already, so no need to convert arguments + # we're in kernel land already, so no need to cudaconvert arguments local kernel_tt = Tuple{$((:(Core.Typeof($var)) for var in var_exprs)...)} local kernel = dynamic_cufunction($(esc(f)), kernel_tt) - dynamic_cudacall(kernel, kernel_tt, $(var_exprs...); $(map(esc, call_kwargs)...)) + kernel($(var_exprs...); $(map(esc, call_kwargs)...)) end) else # regular, host-side kernel launch @@ -363,8 +363,27 @@ Kernel ## dynamic parallelism -dynamic_cufunction(f::Core.Function, tt::Type=Tuple{}) = - ccall("extern cudanativeCompileKernel", llvmcall, Ptr{Cvoid}, (Any, Any), f, tt) +struct DynamicKernel{F,TT} + fun::Ptr{Cvoid} +end + +function dynamic_cufunction(f::Core.Function, tt::Type=Tuple{}) + # we can't compile here, so drop a marker which will get picked up during compilation + fptr = ccall("extern cudanativeCompileKernel", llvmcall, Ptr{Cvoid}, (Any, Any), f, tt) + DynamicKernel{f,tt}(fptr) +end + +@generated function (kernel::DynamicKernel{F,TT})(args...; call_kwargs...) where {F,TT} + # TODO + call_args = :(args) + call_tt = TT + + quote + Base.@_inline_meta + + dynamic_cudacall(kernel.fun, $call_tt, $call_args...; call_kwargs...) + end +end @generated function dynamic_cudacall(f::Ptr{Cvoid}, tt::Type, args...; blocks::CuDim=1, threads::CuDim=1, shmem::Integer=0, From 4c1b12d94c711e57d6d85e7b4c95915ef56a6fee Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Wed, 20 Mar 2019 16:14:26 +0100 Subject: [PATCH 16/34] Ghost argument filtering, etc. 
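
The filtering added here is easiest to see with a host-side sketch. This is
illustrative only and not code from the patch; the predicate `isghost` below is
a stand-in for the package's actual ghost-type check. The idea: argument types
that carry no runtime data (singleton structs, the function object itself)
occupy zero bytes, so they are dropped from the device-side call signature,
while non-isbits arguments must be unused and are replaced by a null pointer.

    # illustrative only: separating ghost types from payload-carrying ones
    struct Ghost end                          # singleton, no fields

    isghost(T::DataType) = isbitstype(T) && sizeof(T) == 0

    sig  = [Ghost, Int32, Float64]            # pretend kernel parameter types
    keep = [T for T in sig if !isghost(T)]
    @assert keep == [Int32, Float64]          # only these reach the parameter buffer
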
--- src/execution.jl | 38 +++++++++++++++++++++++++++++--------- 1 file changed, 29 insertions(+), 9 deletions(-) diff --git a/src/execution.jl b/src/execution.jl index 5feafa5d..e62e10c6 100644 --- a/src/execution.jl +++ b/src/execution.jl @@ -373,34 +373,54 @@ function dynamic_cufunction(f::Core.Function, tt::Type=Tuple{}) DynamicKernel{f,tt}(fptr) end +# FIXME: duplication with (::Kernel)(...) @generated function (kernel::DynamicKernel{F,TT})(args...; call_kwargs...) where {F,TT} - # TODO - call_args = :(args) - call_tt = TT + sig = Base.signature_type(F, TT) + args = (:F, (:( args[$i] ) for i in 1:length(args))...) + + # filter out ghost arguments that shouldn't be passed + to_pass = map(!isghosttype, sig.parameters) + call_t = Type[x[1] for x in zip(sig.parameters, to_pass) if x[2]] + call_args = Union{Expr,Symbol}[x[1] for x in zip(args, to_pass) if x[2]] + + # replace non-isbits arguments (they should be unused, or compilation would have failed) + # alternatively, make CUDAdrv allow `launch` with non-isbits arguments. + for (i,dt) in enumerate(call_t) + if !isbitstype(dt) + call_t[i] = Ptr{Any} + call_args[i] = :C_NULL + end + end + + # finalize types + call_tt = Base.to_tuple_type(call_t) quote Base.@_inline_meta - dynamic_cudacall(kernel.fun, $call_tt, $call_args...; call_kwargs...) + dynamic_cudacall(kernel.fun, $call_tt, $(call_args...); call_kwargs...) end end +# FIXME: duplication with CUDAdrv.cudacall @generated function dynamic_cudacall(f::Ptr{Cvoid}, tt::Type, args...; blocks::CuDim=1, threads::CuDim=1, shmem::Integer=0, stream::CuStream=CuDefaultStream()) + types = tt.parameters[1].parameters # the type of `tt` is Type{Tuple{<:DataType...}} + ex = quote Base.@_inline_meta end # convert the argument values to match the kernel's signature (specified by the user) # (this mimics `lower-ccall` in julia-syntax.scm) - converted_args = Vector{Symbol}(undef, length(args)) - arg_ptrs = Vector{Symbol}(undef, length(args)) - for i in 1:length(args) + converted_args = Vector{Symbol}(undef, length(types)) + arg_ptrs = Vector{Symbol}(undef, length(types)) + for i in 1:length(types) converted_args[i] = gensym() arg_ptrs[i] = gensym() - push!(ex.args, :($(converted_args[i]) = Base.cconvert($(args[i]), args[$i]))) - push!(ex.args, :($(arg_ptrs[i]) = Base.unsafe_convert($(args[i]), $(converted_args[i])))) + push!(ex.args, :($(converted_args[i]) = Base.cconvert($(types[i]), args[$i]))) + push!(ex.args, :($(arg_ptrs[i]) = Base.unsafe_convert($(types[i]), $(converted_args[i])))) end append!(ex.args, (quote From cc0aac65c21f0443f4432e669df3dd60a51ba61b Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Wed, 20 Mar 2019 16:27:28 +0100 Subject: [PATCH 17/34] Start a Kernel type hierarchy. --- src/execution.jl | 54 +++++++++++++++++++++++++----------------------- 1 file changed, 28 insertions(+), 26 deletions(-) diff --git a/src/execution.jl b/src/execution.jl index e62e10c6..fff604bb 100644 --- a/src/execution.jl +++ b/src/execution.jl @@ -3,21 +3,23 @@ export @cuda, cudaconvert, cufunction, nearest_warpsize -## kernel object and query functions +## host-side kernels -struct Kernel{F,TT} +abstract type AbstractKernel{F,TT} end + +struct HostKernel{F,TT} <: AbstractKernel{F,TT} ctx::CuContext mod::CuModule fun::CuFunction end """ - version(k::Kernel) + version(k::HostKernel) Queries the PTX and SM versions a kernel was compiled for. Returns a named tuple. 
""" -function version(k::Kernel) +function version(k::HostKernel) attr = attributes(k.fun) binary_ver = VersionNumber(divrem(attr[CUDAdrv.FUNC_ATTRIBUTE_BINARY_VERSION],10)...) ptx_ver = VersionNumber(divrem(attr[CUDAdrv.FUNC_ATTRIBUTE_PTX_VERSION],10)...) @@ -25,12 +27,12 @@ function version(k::Kernel) end """ - memory(k::Kernel) + memory(k::HostKernel) Queries the local, shared and constant memory usage of a compiled kernel in bytes. Returns a named tuple. """ -function memory(k::Kernel) +function memory(k::HostKernel) attr = attributes(k.fun) local_mem = attr[CUDAdrv.FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES] shared_mem = attr[CUDAdrv.FUNC_ATTRIBUTE_SHARED_SIZE_BYTES] @@ -39,21 +41,21 @@ function memory(k::Kernel) end """ - registers(k::Kernel) + registers(k::HostKernel) Queries the register usage of a kernel. """ -function registers(k::Kernel) +function registers(k::HostKernel) attr = attributes(k.fun) return attr[CUDAdrv.FUNC_ATTRIBUTE_NUM_REGS] end """ - maxthreads(k::Kernel) + maxthreads(k::HostKernel) Queries the maximum amount of threads a kernel can use in a single block. """ -function maxthreads(k::Kernel) +function maxthreads(k::HostKernel) attr = attributes(k.fun) return attr[CUDAdrv.FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK] end @@ -171,7 +173,7 @@ performed, scheduling a kernel launch on the current CUDA context. Several keyword arguments are supported that influence the behavior of `@cuda`. - `dynamic`: use dynamic parallelism to launch device-side kernels - arguments that influence kernel compilation: see [`cufunction`](@ref) -- arguments that influence kernel execution: see [`CUDAnative.Kernel`](@ref) +- arguments that influence kernel execution: see [`CUDAnative.HostKernel`](@ref) The underlying operations (argument conversion, kernel compilation, kernel call) can be performed explicitly when more control is needed, e.g. to reflect on the resource usage of a @@ -245,10 +247,10 @@ macro cuda(ex...) end -## APIs for manual compilation +## host-side launch API const agecache = Dict{UInt, UInt}() -const compilecache = Dict{UInt, Kernel}() +const compilecache = Dict{UInt, HostKernel}() """ cufunction(f, tt=Tuple{}; kwargs...) @@ -301,7 +303,7 @@ when function changes, or when different types or keyword arguments are provided dev = device(ctx) cap = supported_capability(dev) fun, mod = compile(:cuda, cap, f, tt; kwargs...) - kernel = Kernel{f,tt}(ctx, mod, fun) + kernel = HostKernel{f,tt}(ctx, mod, fun) @debug begin ver = version(kernel) mem = memory(kernel) @@ -312,11 +314,11 @@ when function changes, or when different types or keyword arguments are provided compilecache[key] = kernel end - return compilecache[key]::Kernel{f,tt} + return compilecache[key]::HostKernel{f,tt} end end -@generated function (kernel::Kernel{F,TT})(args...; call_kwargs...) where {F,TT} +@generated function (kernel::HostKernel{F,TT})(args...; call_kwargs...) where {F,TT} sig = Base.signature_type(F, TT) args = (:F, (:( args[$i] ) for i in 1:length(args))...) @@ -347,7 +349,7 @@ end # FIXME: there doesn't seem to be a way to access the documentation for the call-syntax, # so attach it to the type """ - (::Kernel)(args...; kwargs...) + (::HostKernel)(args...; kwargs...) Low-level interface to call a compiled kernel, passing GPU-compatible arguments in `args`. For a higher-level interface, use [`@cuda`](@ref). 
@@ -358,23 +360,23 @@ The following keyword arguments are supported: - `shmem` (defaults to 0) - `stream` (defaults to the default stream) """ -Kernel +HostKernel -## dynamic parallelism +## device-side kernels and launch API (aka. dynamic parallelism) -struct DynamicKernel{F,TT} +struct DeviceKernel{F,TT} <: AbstractKernel{F,TT} fun::Ptr{Cvoid} end function dynamic_cufunction(f::Core.Function, tt::Type=Tuple{}) # we can't compile here, so drop a marker which will get picked up during compilation fptr = ccall("extern cudanativeCompileKernel", llvmcall, Ptr{Cvoid}, (Any, Any), f, tt) - DynamicKernel{f,tt}(fptr) + DeviceKernel{f,tt}(fptr) end -# FIXME: duplication with (::Kernel)(...) -@generated function (kernel::DynamicKernel{F,TT})(args...; call_kwargs...) where {F,TT} +# FIXME: duplication with (::HostKernel)(...) +@generated function (kernel::DeviceKernel{F,TT})(args...; call_kwargs...) where {F,TT} sig = Base.signature_type(F, TT) args = (:F, (:( args[$i] ) for i in 1:length(args))...) @@ -414,9 +416,9 @@ end # convert the argument values to match the kernel's signature (specified by the user) # (this mimics `lower-ccall` in julia-syntax.scm) - converted_args = Vector{Symbol}(undef, length(types)) - arg_ptrs = Vector{Symbol}(undef, length(types)) - for i in 1:length(types) + converted_args = Vector{Symbol}(undef, length(args)) + arg_ptrs = Vector{Symbol}(undef, length(args)) + for i in 1:length(args) converted_args[i] = gensym() arg_ptrs[i] = gensym() push!(ex.args, :($(converted_args[i]) = Base.cconvert($(types[i]), args[$i]))) From 8645bb08a6a3dc4ae1ba9e536cc9c63cb093114a Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Thu, 21 Mar 2019 08:47:03 +0100 Subject: [PATCH 18/34] Merge identical dynamic kernel invocations. --- src/compiler/driver.jl | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/src/compiler/driver.jl b/src/compiler/driver.jl index 8ef39bd9..1e280fa4 100644 --- a/src/compiler/driver.jl +++ b/src/compiler/driver.jl @@ -98,7 +98,7 @@ function compile(to::Symbol, ctx::CompilerContext; # find dynamic kernel invocations # TODO: recover this information earlier, from the Julia IR - worklist = [] + worklist = Dict{Tuple{Core.Function,Type}, Vector{LLVM.CallInst}}() for use in uses(f) # decode the call call = user(use)::LLVM.CallInst @@ -113,11 +113,12 @@ function compile(to::Symbol, ctx::CompilerContext; ops = Ptr{Any}.(ops) dyn_f, dyn_tt = unsafe_pointer_to_objref.(ops) - push!(worklist, (call, dyn_f, dyn_tt)) + calls = get!(worklist, (dyn_f, dyn_tt), LLVM.CallInst[]) + push!(calls, call) end # compile and link - for (call, dyn_f, dyn_tt) in worklist + for (dyn_f, dyn_tt) in keys(worklist) dyn_ctx = CompilerContext(dyn_f, dyn_tt, ctx.cap, true) dyn_ir, dyn_entry = compile(:llvm, dyn_ctx; hooks=false, optimize=optimize, strip=strip) @@ -127,8 +128,11 @@ function compile(to::Symbol, ctx::CompilerContext; dyn_ir = nothing dyn_entry = functions(ir)[dyn_fn] - replace_uses!(call, dyn_entry) - unsafe_delete!(LLVM.parent(call), call) + # insert a call everywhere the kernel is used + for call in worklist[(dyn_f,dyn_tt)] + replace_uses!(call, dyn_entry) + unsafe_delete!(LLVM.parent(call), call) + end end @compiler_assert isempty(uses(f)) ctx From 695fda4d123bc4236c01503645650c36353e4c46 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Thu, 21 Mar 2019 09:01:19 +0100 Subject: [PATCH 19/34] Detect closures. 
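
For context, a small host-side sketch (not part of the patch) of why checking
`sizeof` of the function type works as a closure test: a top-level generic
function is a singleton type with no fields, whereas a closure stores its
captured variables as fields and therefore has a nonzero instance size.

    plain(x) = x + 1                      # singleton function type
    make_adder(y) = x -> x + y            # returns a closure capturing `y`
    closed = make_adder(2.0)

    @assert sizeof(typeof(plain)) == 0    # fine for dynamic parallelism
    @assert sizeof(typeof(closed)) > 0    # rejected by dynamic_cufunction

Since the check runs in a generated function on the device, the failure is
reported with a printed error followed by a trap rather than a Julia exception.
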
--- src/CUDAnative.jl | 1 + src/device/llvm.jl | 3 +++ src/execution.jl | 19 +++++++++++++++---- 3 files changed, 19 insertions(+), 4 deletions(-) create mode 100644 src/device/llvm.jl diff --git a/src/CUDAnative.jl b/src/CUDAnative.jl index cec0b9cf..018729ae 100644 --- a/src/CUDAnative.jl +++ b/src/CUDAnative.jl @@ -27,6 +27,7 @@ include(joinpath("device", "tools.jl")) include(joinpath("device", "pointer.jl")) include(joinpath("device", "array.jl")) include(joinpath("device", "cuda.jl")) +include(joinpath("device", "llvm.jl")) include(joinpath("device", "runtime.jl")) include("compiler.jl") diff --git a/src/device/llvm.jl b/src/device/llvm.jl new file mode 100644 index 00000000..ddf7364d --- /dev/null +++ b/src/device/llvm.jl @@ -0,0 +1,3 @@ +# wrappers for LLVM-specific functionality + +@inline trap() = ccall("llvm.trap", llvmcall, Cvoid, ()) diff --git a/src/execution.jl b/src/execution.jl index fff604bb..54882cf4 100644 --- a/src/execution.jl +++ b/src/execution.jl @@ -369,10 +369,21 @@ struct DeviceKernel{F,TT} <: AbstractKernel{F,TT} fun::Ptr{Cvoid} end -function dynamic_cufunction(f::Core.Function, tt::Type=Tuple{}) - # we can't compile here, so drop a marker which will get picked up during compilation - fptr = ccall("extern cudanativeCompileKernel", llvmcall, Ptr{Cvoid}, (Any, Any), f, tt) - DeviceKernel{f,tt}(fptr) +@generated function dynamic_cufunction(f::Core.Function, tt::Type=Tuple{}) + if sizeof(f) > 0 + Core.println(Core.stderr, "ERROR: @cuda dynamic parallelism does not support closures") + quote + trap() + DeviceKernel{f,tt}(C_NULL) + end + else + # we can't compile here, so drop a marker which will get picked up during compilation + quote + fptr = ccall("extern cudanativeCompileKernel", llvmcall, Ptr{Cvoid}, + (Any, Any), f, tt) + DeviceKernel{f,tt}(fptr) + end + end end # FIXME: duplication with (::HostKernel)(...) From 7bd4628add0bde44b5055589c3503735d5a8a4c4 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Thu, 21 Mar 2019 10:20:19 +0100 Subject: [PATCH 20/34] Avoid duplication by attaching call to AbstractKernel. --- src/execution.jl | 343 +++++++++++++++++++++++------------------------ 1 file changed, 168 insertions(+), 175 deletions(-) diff --git a/src/execution.jl b/src/execution.jl index 54882cf4..20ffb2c9 100644 --- a/src/execution.jl +++ b/src/execution.jl @@ -3,64 +3,6 @@ export @cuda, cudaconvert, cufunction, nearest_warpsize -## host-side kernels - -abstract type AbstractKernel{F,TT} end - -struct HostKernel{F,TT} <: AbstractKernel{F,TT} - ctx::CuContext - mod::CuModule - fun::CuFunction -end - -""" - version(k::HostKernel) - -Queries the PTX and SM versions a kernel was compiled for. -Returns a named tuple. -""" -function version(k::HostKernel) - attr = attributes(k.fun) - binary_ver = VersionNumber(divrem(attr[CUDAdrv.FUNC_ATTRIBUTE_BINARY_VERSION],10)...) - ptx_ver = VersionNumber(divrem(attr[CUDAdrv.FUNC_ATTRIBUTE_PTX_VERSION],10)...) - return (ptx=ptx_ver, binary=binary_ver) -end - -""" - memory(k::HostKernel) - -Queries the local, shared and constant memory usage of a compiled kernel in bytes. -Returns a named tuple. -""" -function memory(k::HostKernel) - attr = attributes(k.fun) - local_mem = attr[CUDAdrv.FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES] - shared_mem = attr[CUDAdrv.FUNC_ATTRIBUTE_SHARED_SIZE_BYTES] - constant_mem = attr[CUDAdrv.FUNC_ATTRIBUTE_CONST_SIZE_BYTES] - return (:local=>local_mem, shared=shared_mem, constant=constant_mem) -end - -""" - registers(k::HostKernel) - -Queries the register usage of a kernel. 
-""" -function registers(k::HostKernel) - attr = attributes(k.fun) - return attr[CUDAdrv.FUNC_ATTRIBUTE_NUM_REGS] -end - -""" - maxthreads(k::HostKernel) - -Queries the maximum amount of threads a kernel can use in a single block. -""" -function maxthreads(k::HostKernel) - attr = attributes(k.fun) - return attr[CUDAdrv.FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK] -end - - ## helper functions # split keyword arguments to `@cuda` into ones affecting the macro itself, the compiler and @@ -131,34 +73,6 @@ function method_age(f, tt)::UInt end -## adaptors - -struct Adaptor end - -# convert CUDAdrv pointers to CUDAnative pointers -Adapt.adapt_storage(to::Adaptor, p::CuPtr{T}) where {T} = DevicePtr{T,AS.Generic}(p) - -# Base.RefValue isn't GPU compatible, so provide a compatible alternative -struct CuRefValue{T} <: Ref{T} - x::T -end -Base.getindex(r::CuRefValue) = r.x -Adapt.adapt_structure(to::Adaptor, r::Base.RefValue) = CuRefValue(adapt(to, r[])) - -# convenience function -""" - cudaconvert(x) - -This function is called for every argument to be passed to a kernel, allowing it to be -converted to a GPU-friendly format. By default, the function does nothing and returns the -input object `x` as-is. - -Do not add methods to this function, but instead extend the underlying Adapt.jl package and -register methods for the the `CUDAnative.Adaptor` type. -""" -cudaconvert(arg) = adapt(Adaptor(), arg) - - ## high-level @cuda interface """ @@ -247,7 +161,145 @@ macro cuda(ex...) end -## host-side launch API +## host to device value conversion + +struct Adaptor end + +# convert CUDAdrv pointers to CUDAnative pointers +Adapt.adapt_storage(to::Adaptor, p::CuPtr{T}) where {T} = DevicePtr{T,AS.Generic}(p) + +# Base.RefValue isn't GPU compatible, so provide a compatible alternative +struct CuRefValue{T} <: Ref{T} + x::T +end +Base.getindex(r::CuRefValue) = r.x +Adapt.adapt_structure(to::Adaptor, r::Base.RefValue) = CuRefValue(adapt(to, r[])) + +""" + cudaconvert(x) + +This function is called for every argument to be passed to a kernel, allowing it to be +converted to a GPU-friendly format. By default, the function does nothing and returns the +input object `x` as-is. + +Do not add methods to this function, but instead extend the underlying Adapt.jl package and +register methods for the the `CUDAnative.Adaptor` type. +""" +cudaconvert(arg) = adapt(Adaptor(), arg) + + +## abstract kernel functionality + +abstract type AbstractKernel{F,TT} end + +# FIXME: there doesn't seem to be a way to access the documentation for the call-syntax, +# so attach it to the type +""" + (::HostKernel)(args...; kwargs...) + (::DeviceKernel)(args...; kwargs...) + +Low-level interface to call a compiled kernel, passing GPU-compatible arguments in `args`. +For a higher-level interface, use [`@cuda`](@ref). + +The following keyword arguments are supported: +- `threads` (defaults to 1) +- `blocks` (defaults to 1) +- `shmem` (defaults to 0) +- `stream` (defaults to the default stream) +""" +AbstractKernel + +@generated function call(kernel::AbstractKernel{F,TT}, args...; call_kwargs...) where {F,TT} + sig = Base.signature_type(F, TT) + args = (:F, (:( args[$i] ) for i in 1:length(args))...) 
+ + # filter out ghost arguments that shouldn't be passed + to_pass = map(!isghosttype, sig.parameters) + call_t = Type[x[1] for x in zip(sig.parameters, to_pass) if x[2]] + call_args = Union{Expr,Symbol}[x[1] for x in zip(args, to_pass) if x[2]] + + # replace non-isbits arguments (they should be unused, or compilation would have failed) + # alternatively, make CUDAdrv allow `launch` with non-isbits arguments. + for (i,dt) in enumerate(call_t) + if !isbitstype(dt) + call_t[i] = Ptr{Any} + call_args[i] = :C_NULL + end + end + + # finalize types + call_tt = Base.to_tuple_type(call_t) + + quote + Base.@_inline_meta + + cudacall(kernel, $call_tt, $(call_args...); call_kwargs...) + end +end + + +## host-side kernels + +struct HostKernel{F,TT} <: AbstractKernel{F,TT} + ctx::CuContext + mod::CuModule + fun::CuFunction +end + +@doc (@doc AbstractKernel) HostKernel + +@inline cudacall(kernel::HostKernel, tt, args...; kwargs...) = + CUDAdrv.cudacall(kernel.fun, tt, args...; kwargs...) + +""" + version(k::HostKernel) + +Queries the PTX and SM versions a kernel was compiled for. +Returns a named tuple. +""" +function version(k::HostKernel) + attr = attributes(k.fun) + binary_ver = VersionNumber(divrem(attr[CUDAdrv.FUNC_ATTRIBUTE_BINARY_VERSION],10)...) + ptx_ver = VersionNumber(divrem(attr[CUDAdrv.FUNC_ATTRIBUTE_PTX_VERSION],10)...) + return (ptx=ptx_ver, binary=binary_ver) +end + +""" + memory(k::HostKernel) + +Queries the local, shared and constant memory usage of a compiled kernel in bytes. +Returns a named tuple. +""" +function memory(k::HostKernel) + attr = attributes(k.fun) + local_mem = attr[CUDAdrv.FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES] + shared_mem = attr[CUDAdrv.FUNC_ATTRIBUTE_SHARED_SIZE_BYTES] + constant_mem = attr[CUDAdrv.FUNC_ATTRIBUTE_CONST_SIZE_BYTES] + return (:local=>local_mem, shared=shared_mem, constant=constant_mem) +end + +""" + registers(k::HostKernel) + +Queries the register usage of a kernel. +""" +function registers(k::HostKernel) + attr = attributes(k.fun) + return attr[CUDAdrv.FUNC_ATTRIBUTE_NUM_REGS] +end + +""" + maxthreads(k::HostKernel) + +Queries the maximum amount of threads a kernel can use in a single block. +""" +function maxthreads(k::HostKernel) + attr = attributes(k.fun) + return attr[CUDAdrv.FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK] +end + + +## host-side API const agecache = Dict{UInt, UInt}() const compilecache = Dict{UInt, HostKernel}() @@ -318,102 +370,20 @@ when function changes, or when different types or keyword arguments are provided end end -@generated function (kernel::HostKernel{F,TT})(args...; call_kwargs...) where {F,TT} - sig = Base.signature_type(F, TT) - args = (:F, (:( args[$i] ) for i in 1:length(args))...) +# https://github.com/JuliaLang/julia/issues/14919 +(kernel::HostKernel)(args...; kwargs...) = call(kernel, args...; kwargs...) - # filter out ghost arguments that shouldn't be passed - to_pass = map(!isghosttype, sig.parameters) - call_t = Type[x[1] for x in zip(sig.parameters, to_pass) if x[2]] - call_args = Union{Expr,Symbol}[x[1] for x in zip(args, to_pass) if x[2]] - # replace non-isbits arguments (they should be unused, or compilation would have failed) - # alternatively, make CUDAdrv allow `launch` with non-isbits arguments. - for (i,dt) in enumerate(call_t) - if !isbitstype(dt) - call_t[i] = Ptr{Any} - call_args[i] = :C_NULL - end - end - - # finalize types - call_tt = Base.to_tuple_type(call_t) - - quote - Base.@_inline_meta - - cudacall(kernel.fun, $call_tt, $(call_args...); call_kwargs...) 
- end -end - -# FIXME: there doesn't seem to be a way to access the documentation for the call-syntax, -# so attach it to the type -""" - (::HostKernel)(args...; kwargs...) - -Low-level interface to call a compiled kernel, passing GPU-compatible arguments in `args`. -For a higher-level interface, use [`@cuda`](@ref). - -The following keyword arguments are supported: -- `threads` (defaults to 1) -- `blocks` (defaults to 1) -- `shmem` (defaults to 0) -- `stream` (defaults to the default stream) -""" -HostKernel - - -## device-side kernels and launch API (aka. dynamic parallelism) +## device-side kernels struct DeviceKernel{F,TT} <: AbstractKernel{F,TT} fun::Ptr{Cvoid} end -@generated function dynamic_cufunction(f::Core.Function, tt::Type=Tuple{}) - if sizeof(f) > 0 - Core.println(Core.stderr, "ERROR: @cuda dynamic parallelism does not support closures") - quote - trap() - DeviceKernel{f,tt}(C_NULL) - end - else - # we can't compile here, so drop a marker which will get picked up during compilation - quote - fptr = ccall("extern cudanativeCompileKernel", llvmcall, Ptr{Cvoid}, - (Any, Any), f, tt) - DeviceKernel{f,tt}(fptr) - end - end -end - -# FIXME: duplication with (::HostKernel)(...) -@generated function (kernel::DeviceKernel{F,TT})(args...; call_kwargs...) where {F,TT} - sig = Base.signature_type(F, TT) - args = (:F, (:( args[$i] ) for i in 1:length(args))...) - - # filter out ghost arguments that shouldn't be passed - to_pass = map(!isghosttype, sig.parameters) - call_t = Type[x[1] for x in zip(sig.parameters, to_pass) if x[2]] - call_args = Union{Expr,Symbol}[x[1] for x in zip(args, to_pass) if x[2]] - - # replace non-isbits arguments (they should be unused, or compilation would have failed) - # alternatively, make CUDAdrv allow `launch` with non-isbits arguments. - for (i,dt) in enumerate(call_t) - if !isbitstype(dt) - call_t[i] = Ptr{Any} - call_args[i] = :C_NULL - end - end - - # finalize types - call_tt = Base.to_tuple_type(call_t) - - quote - Base.@_inline_meta +@doc (@doc AbstractKernel) DeviceKernel - dynamic_cudacall(kernel.fun, $call_tt, $(call_args...); call_kwargs...) - end -end +@inline cudacall(kernel::DeviceKernel, tt, args...; kwargs...) = + dynamic_cudacall(kernel.fun, tt, args...; kwargs...) # FIXME: duplication with CUDAdrv.cudacall @generated function dynamic_cudacall(f::Ptr{Cvoid}, tt::Type, args...; @@ -446,6 +416,29 @@ end end +## device-side API + +@generated function dynamic_cufunction(f::Core.Function, tt::Type=Tuple{}) + if sizeof(f) > 0 + Core.println(Core.stderr, "ERROR: @cuda dynamic parallelism does not support closures") + quote + trap() + DeviceKernel{f,tt}(C_NULL) + end + else + # we can't compile here, so drop a marker which will get picked up during compilation + quote + fptr = ccall("extern cudanativeCompileKernel", llvmcall, Ptr{Cvoid}, + (Any, Any), f, tt) + DeviceKernel{f,tt}(fptr) + end + end +end + +# https://github.com/JuliaLang/julia/issues/14919 +(kernel::DeviceKernel)(args...; kwargs...) = call(kernel, args...; kwargs...) + + ## other """ From b74ed8f11df566f9f9dcaf37e9d11119978fb9e4 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Thu, 21 Mar 2019 10:38:18 +0100 Subject: [PATCH 21/34] Documentation, error checking, etc. --- src/execution.jl | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/src/execution.jl b/src/execution.jl index 20ffb2c9..428f586a 100644 --- a/src/execution.jl +++ b/src/execution.jl @@ -86,8 +86,10 @@ performed, scheduling a kernel launch on the current CUDA context. 
Several keyword arguments are supported that influence the behavior of `@cuda`. - `dynamic`: use dynamic parallelism to launch device-side kernels -- arguments that influence kernel compilation: see [`cufunction`](@ref) -- arguments that influence kernel execution: see [`CUDAnative.HostKernel`](@ref) +- arguments that influence kernel compilation: see [`cufunction`](@ref) and + [`dynamic_cufunction`](@ref) +- arguments that influence kernel launch: see [`CUDAnative.HostKernel`](@ref) and + [`CUDAnative.DeviceKernel`](@ref) The underlying operations (argument conversion, kernel compilation, kernel call) can be performed explicitly when more control is needed, e.g. to reflect on the resource usage of a @@ -100,6 +102,15 @@ kernel to determine the launch configuration. A host-side kernel launch is done kernel = cufunction(f, kernel_tt; compilation_kwargs) kernel(kernel_args...; launch_kwargs) end + +A device-side launch, aka. dynamic parallelism, is similar but more restricted: + + args = ... + # GC.@preserve is not supported + # we're on the device already, so no need to cudaconvert + kernel_tt = Tuple{Core.Typeof(args[1]), ...} # this needs to be fully inferred! + kernel = dynamic_cufunction(f, kernel_tt) # no compiler kwargs supported + kernel(args...; launch_kwargs) """ macro cuda(ex...) # destructure the `@cuda` expression @@ -125,6 +136,7 @@ macro cuda(ex...) for kwarg in macro_kwargs key,val = kwarg.args if key == :dynamic + isa(val, Bool) || throw(ArgumentError("`dynamic` keyword argument to @cuda should be a constant value")) dynamic = val::Bool else throw(ArgumentError("Unsupported keyword argument '$key'")) @@ -132,6 +144,11 @@ macro cuda(ex...) end if dynamic + # FIXME: we could probably somehow support kwargs with constant values by either + # saving them in a global Dict here, or trying to pick them up from the Julia + # IR when processing the dynamic parallelism marker + isempty(compiler_kwargs) || error("@cuda dynamic parallelism does not support compiler keyword arguments") + # dynamic, device-side kernel launch push!(code.args, quote @@ -418,6 +435,14 @@ end ## device-side API +""" + dynamic_cufunction(f, tt=Tuple{}) + +Low-level interface to compile a function invocation for the currently-active GPU, returning +a callable kernel object. Device-side equivalent of [`CUDAnative.cufunction`](@ref). + +No keyword arguments are supported. +""" @generated function dynamic_cufunction(f::Core.Function, tt::Type=Tuple{}) if sizeof(f) > 0 Core.println(Core.stderr, "ERROR: @cuda dynamic parallelism does not support closures") From cd6e2aef4f3948afd598b71c0fade8393d249193 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Thu, 21 Mar 2019 11:10:38 +0100 Subject: [PATCH 22/34] Rename CompilerContext to CompilerJob. It's not really a context. Also disambiguates from CUDA's and LLVM's contexts. 
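
A rough usage sketch of the renamed type (the kernel, tuple type and capability
below are made-up values mirroring how reflection.jl constructs jobs; this is
not code from the patch):

    dummy(x) = nothing                                  # hypothetical kernel
    job = CompilerJob(dummy, Tuple{Float64}, v"6.1", #=kernel=#true)
    ir, entry = compile(:llvm, job; hooks=false)        # LLVM module + entry
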
--- src/compiler/common.jl | 38 +++++++++++----------- src/compiler/driver.jl | 40 +++++++++++------------ src/compiler/irgen.jl | 30 ++++++++--------- src/compiler/mcgen.jl | 12 +++---- src/compiler/optim.jl | 52 +++++++++++++++--------------- src/compiler/rtlib.jl | 6 ++-- src/compiler/validation.jl | 46 +++++++++++++------------- src/reflection.jl | 66 +++++++++++++++++++------------------- 8 files changed, 145 insertions(+), 145 deletions(-) diff --git a/src/compiler/common.jl b/src/compiler/common.jl index f11eee5f..b9160a5f 100644 --- a/src/compiler/common.jl +++ b/src/compiler/common.jl @@ -1,6 +1,6 @@ # common functionality -struct CompilerContext +struct CompilerJob # core invocation f::Core.Function tt::DataType @@ -13,38 +13,38 @@ struct CompilerContext blocks_per_sm::Union{Nothing,Integer} maxregs::Union{Nothing,Integer} - CompilerContext(f, tt, cap, kernel; + CompilerJob(f, tt, cap, kernel; minthreads=nothing, maxthreads=nothing, blocks_per_sm=nothing, maxregs=nothing) = new(f, tt, cap, kernel, minthreads, maxthreads, blocks_per_sm, maxregs) end -# global context reference -# FIXME: thread through `ctx` everywhere (deadlocks the Julia compiler when doing so with +# global job reference +# FIXME: thread through `job` everywhere (deadlocks the Julia compiler when doing so with # the LLVM passes in CUDAnative) -global_ctx = nothing +current_job = nothing -function signature(ctx::CompilerContext) - fn = typeof(ctx.f).name.mt.name - args = join(ctx.tt.parameters, ", ") - return "$fn($(join(ctx.tt.parameters, ", ")))" +function signature(job::CompilerJob) + fn = typeof(job.f).name.mt.name + args = join(job.tt.parameters, ", ") + return "$fn($(join(job.tt.parameters, ", ")))" end struct KernelError <: Exception - ctx::CompilerContext + job::CompilerJob message::String help::Union{Nothing,String} bt::StackTraces.StackTrace - KernelError(ctx::CompilerContext, message::String, help=nothing; + KernelError(job::CompilerJob, message::String, help=nothing; bt=StackTraces.StackTrace()) = - new(ctx, message, help, bt) + new(job, message, help, bt) end function Base.showerror(io::IO, err::KernelError) - println(io, "GPU compilation of $(signature(err.ctx)) failed") + println(io, "GPU compilation of $(signature(err.job)) failed") println(io, "KernelError: $(err.message)") println(io) println(io, something(err.help, "Try inspecting the generated code with any of the @device_code_... macros.")) @@ -53,10 +53,10 @@ end struct InternalCompilerError <: Exception - ctx::CompilerContext + job::CompilerJob message::String meta::Dict - InternalCompilerError(ctx, message; kwargs...) = new(ctx, message, kwargs) + InternalCompilerError(job, message; kwargs...) = new(job, message, kwargs) end function Base.showerror(io::IO, err::InternalCompilerError) @@ -67,8 +67,8 @@ function Base.showerror(io::IO, err::InternalCompilerError) println(io, "\nInternalCompilerError: $(err.message)") println(io, "\nCompiler invocation:") - for field in fieldnames(CompilerContext) - println(io, " - $field = $(repr(getfield(err.ctx, field)))") + for field in fieldnames(CompilerJob) + println(io, " - $field = $(repr(getfield(err.job, field)))") end if !isempty(err.meta) @@ -87,10 +87,10 @@ function Base.showerror(io::IO, err::InternalCompilerError) versioninfo(io) end -macro compiler_assert(ex, ctx, kwargs...) +macro compiler_assert(ex, job, kwargs...) msg = "$ex, at $(__source__.file):$(__source__.line)" return :($(esc(ex)) ? 
$(nothing) - : throw(InternalCompilerError($(esc(ctx)), $msg; + : throw(InternalCompilerError($(esc(job)), $msg; $(map(esc, kwargs)...))) ) end diff --git a/src/compiler/driver.jl b/src/compiler/driver.jl index 1e280fa4..9687b050 100644 --- a/src/compiler/driver.jl +++ b/src/compiler/driver.jl @@ -1,6 +1,6 @@ # compiler driver and main interface -# (::CompilerContext) +# (::CompilerJob) const compile_hook = Ref{Union{Nothing,Function}}(nothing) """ @@ -22,18 +22,18 @@ Other keyword arguments can be found in the documentation of [`cufunction`](@ref compile(to::Symbol, cap::VersionNumber, @nospecialize(f::Core.Function), @nospecialize(tt), kernel::Bool=true; hooks::Bool=true, optimize::Bool=true, strip::Bool=false, kwargs...) = - compile(to, CompilerContext(f, tt, cap, kernel; kwargs...); + compile(to, CompilerJob(f, tt, cap, kernel; kwargs...); hooks=hooks, optimize=optimize, strip=strip) -function compile(to::Symbol, ctx::CompilerContext; +function compile(to::Symbol, job::CompilerJob; hooks::Bool=true, optimize::Bool=true, strip::Bool=false) - @debug "(Re)compiling function" ctx + @debug "(Re)compiling function" job if hooks && compile_hook[] != nothing global globalUnique previous_globalUnique = globalUnique - compile_hook[](ctx) + compile_hook[](job) globalUnique = previous_globalUnique end @@ -41,12 +41,12 @@ function compile(to::Symbol, ctx::CompilerContext; ## Julia IR - check_method(ctx) + check_method(job) # get the method instance world = typemax(UInt) - meth = which(ctx.f, ctx.tt) - sig = Base.signature_type(ctx.f, ctx.tt)::Type + meth = which(job.f, job.tt) + sig = Base.signature_type(job.f, job.tt)::Type (ti, env) = ccall(:jl_type_intersection_with_env, Any, (Any, Any), sig, meth.sig)::Core.SimpleVector if VERSION >= v"1.2.0-DEV.320" @@ -62,26 +62,26 @@ function compile(to::Symbol, ctx::CompilerContext; ## LLVM IR - ir, entry = irgen(ctx, linfo, world) + ir, entry = irgen(job, linfo, world) need_library(lib) = any(f -> isdeclaration(f) && intrinsic_id(f) == 0 && haskey(functions(lib), LLVM.name(f)), functions(ir)) - libdevice = load_libdevice(ctx.cap) + libdevice = load_libdevice(job.cap) if need_library(libdevice) - link_libdevice!(ctx, ir, libdevice) + link_libdevice!(job, ir, libdevice) end # optimize the IR if optimize - entry = optimize!(ctx, ir, entry) + entry = optimize!(job, ir, entry) end - runtime = load_runtime(ctx.cap) + runtime = load_runtime(job.cap) if need_library(runtime) - link_library!(ctx, ir, runtime) + link_library!(job, ir, runtime) end verify(ir) @@ -119,7 +119,7 @@ function compile(to::Symbol, ctx::CompilerContext; # compile and link for (dyn_f, dyn_tt) in keys(worklist) - dyn_ctx = CompilerContext(dyn_f, dyn_tt, ctx.cap, true) + dyn_ctx = CompilerJob(dyn_f, dyn_tt, job.cap, true) dyn_ir, dyn_entry = compile(:llvm, dyn_ctx; hooks=false, optimize=optimize, strip=strip) @@ -135,7 +135,7 @@ function compile(to::Symbol, ctx::CompilerContext; end end - @compiler_assert isempty(uses(f)) ctx + @compiler_assert isempty(uses(f)) job unsafe_delete!(ir, f) end @@ -144,12 +144,12 @@ function compile(to::Symbol, ctx::CompilerContext; ## PTX machine code - prepare_execution!(ctx, ir) + prepare_execution!(job, ir) - check_invocation(ctx, entry) - check_ir(ctx, ir) + check_invocation(job, entry) + check_ir(job, ir) - asm = mcgen(ctx, ir, entry) + asm = mcgen(job, ir, entry) to == :ptx && return asm, LLVM.name(entry) diff --git a/src/compiler/irgen.jl b/src/compiler/irgen.jl index e9677092..23361722 100644 --- a/src/compiler/irgen.jl +++ b/src/compiler/irgen.jl @@ -16,7 
+16,7 @@ safe_fn(f::Core.Function) = safe_fn(String(typeof(f).name.mt.name)) safe_fn(f::LLVM.Function) = safe_fn(LLVM.name(f)) # generate a pseudo-backtrace from a stack of methods being emitted -function backtrace(ctx::CompilerContext, method_stack::Vector{Core.MethodInstance}) +function backtrace(job::CompilerJob, method_stack::Vector{Core.MethodInstance}) bt = StackTraces.StackFrame[] for method_instance in method_stack method = method_instance.def @@ -34,7 +34,7 @@ end Base.showerror(io::IO, err::MethodSubstitutionWarning) = print(io, "You called $(err.original), maybe you intended to call $(err.substitute) instead?") -function compile_linfo(ctx::CompilerContext, linfo::Core.MethodInstance, world) +function compile_linfo(job::CompilerJob, linfo::Core.MethodInstance, world) # set-up the compiler interface function hook_module_setup(ref::Ptr{Cvoid}) ref = convert(LLVM.API.LLVMModuleRef, ref) @@ -51,8 +51,8 @@ function compile_linfo(ctx::CompilerContext, linfo::Core.MethodInstance, world) # check for recursion if method_instance in method_stack[1:end-1] - throw(KernelError(ctx, "recursion is currently not supported"; - bt=backtrace(ctx, method_stack))) + throw(KernelError(job, "recursion is currently not supported"; + bt=backtrace(job, method_stack))) end # check for Base methods that exist in CUDAnative too @@ -65,13 +65,13 @@ function compile_linfo(ctx::CompilerContext, linfo::Core.MethodInstance, world) if hasmethod(substitute_function, tt) method′ = which(substitute_function, tt) if Base.moduleroot(method′.module) == CUDAnative - @warn "calls to Base intrinsics might be GPU incompatible" exception=(MethodSubstitutionWarning(method, method′), backtrace(ctx, method_stack)) + @warn "calls to Base intrinsics might be GPU incompatible" exception=(MethodSubstitutionWarning(method, method′), backtrace(job, method_stack)) end end end end function hook_emitted_function(method, code, world) - @compiler_assert last(method_stack) == method ctx + @compiler_assert last(method_stack) == method job pop!(method_stack) end params = Base.CodegenParams(cached = false, @@ -89,7 +89,7 @@ function compile_linfo(ctx::CompilerContext, linfo::Core.MethodInstance, world) (Any, UInt, Bool, Bool, Base.CodegenParams), linfo, world, #=wrapper=#false, #=optimize=#false, params) if ref == C_NULL - throw(InternalCompilerError(ctx, "the Julia compiler could not generate LLVM IR")) + throw(InternalCompilerError(job, "the Julia compiler could not generate LLVM IR")) end llvmf = LLVM.Function(ref) @@ -97,8 +97,8 @@ function compile_linfo(ctx::CompilerContext, linfo::Core.MethodInstance, world) return llvmf, modules end -function irgen(ctx::CompilerContext, linfo::Core.MethodInstance, world) - entry, modules = compile_linfo(ctx, linfo, world) +function irgen(job::CompilerJob, linfo::Core.MethodInstance, world) + entry, modules = compile_linfo(job, linfo, world) # link in dependent modules mod = popfirst!(modules) @@ -165,8 +165,8 @@ function irgen(ctx::CompilerContext, linfo::Core.MethodInstance, world) # minimal required optimization ModulePassManager() do pm - global global_ctx - global_ctx = ctx + global current_job + current_job = job add!(pm, ModulePass("LowerThrow", lower_throw!)) add!(pm, FunctionPass("HideUnreachable", hide_unreachable!)) @@ -187,7 +187,7 @@ end # once we have thorough inference (ie. discarding `@nospecialize` and thus supporting # exception arguments) and proper debug info to unwind the stack, this pass can go. 
function lower_throw!(mod::LLVM.Module) - ctx = global_ctx::CompilerContext + job = current_job::CompilerJob changed = false throw_functions = Dict{String,String}( @@ -241,7 +241,7 @@ function lower_throw!(mod::LLVM.Module) changed = true end - @compiler_assert isempty(uses(f)) ctx + @compiler_assert isempty(uses(f)) job end end @@ -300,7 +300,7 @@ end # only to prevent introducing non-structureness during optimization (ie. the front-end # is still responsible for generating structured control flow). function hide_unreachable!(fun::LLVM.Function) - ctx = global_ctx::CompilerContext + job = current_job::CompilerJob changed = false # remove `noreturn` attributes @@ -413,7 +413,7 @@ end # # if LLVM knows we're trapping, code is marked `unreachable` (see `hide_unreachable!`). function hide_trap!(mod::LLVM.Module) - ctx = global_ctx::CompilerContext + job = current_job::CompilerJob changed = false # inline assembly to exit a thread, hiding control flow from LLVM diff --git a/src/compiler/mcgen.jl b/src/compiler/mcgen.jl index 3d46db44..ef309fe1 100644 --- a/src/compiler/mcgen.jl +++ b/src/compiler/mcgen.jl @@ -21,10 +21,10 @@ end # final preparations for the module to be compiled to PTX # these passes should not be run when e.g. compiling to write to disk. -function prepare_execution!(ctx::CompilerContext, mod::LLVM.Module) +function prepare_execution!(job::CompilerJob, mod::LLVM.Module) let pm = ModulePassManager() - global global_ctx - global_ctx = ctx + global current_job + current_job = job global_optimizer!(pm) @@ -49,7 +49,7 @@ end # # this pass performs that resolution at link time. function resolve_cpu_references!(mod::LLVM.Module) - ctx = global_ctx::CompilerContext + job = current_job::CompilerJob changed = false for f in functions(mod) @@ -85,8 +85,8 @@ function resolve_cpu_references!(mod::LLVM.Module) return changed end -function mcgen(ctx::CompilerContext, mod::LLVM.Module, f::LLVM.Function) - tm = machine(ctx.cap, triple(mod)) +function mcgen(job::CompilerJob, mod::LLVM.Module, f::LLVM.Function) + tm = machine(job.cap, triple(mod)) InitializeNVPTXAsmPrinter() return String(emit(tm, mod, LLVM.API.LLVMAssemblyFile)) diff --git a/src/compiler/optim.jl b/src/compiler/optim.jl index 7e38e155..f4f3302b 100644 --- a/src/compiler/optim.jl +++ b/src/compiler/optim.jl @@ -1,10 +1,10 @@ # LLVM IR optimization -function optimize!(ctx::CompilerContext, mod::LLVM.Module, entry::LLVM.Function) - tm = machine(ctx.cap, triple(mod)) +function optimize!(job::CompilerJob, mod::LLVM.Module, entry::LLVM.Function) + tm = machine(job.cap, triple(mod)) - if ctx.kernel - entry = promote_kernel!(ctx, mod, entry) + if job.kernel + entry = promote_kernel!(job, mod, entry) end function initialize!(pm) @@ -13,8 +13,8 @@ function optimize!(ctx::CompilerContext, mod::LLVM.Module, entry::LLVM.Function) internalize!(pm, [LLVM.name(entry)]) end - global global_ctx - global_ctx = ctx + global current_job + current_job = job # Julia-specific optimizations # @@ -118,8 +118,8 @@ end # promote a function to a kernel # FIXME: sig vs tt (code_llvm vs cufunction) -function promote_kernel!(ctx::CompilerContext, mod::LLVM.Module, entry_f::LLVM.Function) - kernel = wrap_entry!(ctx, mod, entry_f) +function promote_kernel!(job::CompilerJob, mod::LLVM.Module, entry_f::LLVM.Function) + kernel = wrap_entry!(job, mod, entry_f) # property annotations # TODO: belongs in irgen? doesn't maxntidx doesn't appear in ptx code? 
@@ -130,16 +130,16 @@ function promote_kernel!(ctx::CompilerContext, mod::LLVM.Module, entry_f::LLVM.F append!(annotations, [MDString("kernel"), ConstantInt(Int32(1), JuliaContext())]) ## expected CTA sizes - if ctx.minthreads != nothing - bounds = CUDAdrv.CuDim3(ctx.minthreads) + if job.minthreads != nothing + bounds = CUDAdrv.CuDim3(job.minthreads) for dim in (:x, :y, :z) bound = getfield(bounds, dim) append!(annotations, [MDString("reqntid$dim"), ConstantInt(Int32(bound), JuliaContext())]) end end - if ctx.maxthreads != nothing - bounds = CUDAdrv.CuDim3(ctx.maxthreads) + if job.maxthreads != nothing + bounds = CUDAdrv.CuDim3(job.maxthreads) for dim in (:x, :y, :z) bound = getfield(bounds, dim) append!(annotations, [MDString("maxntid$dim"), @@ -147,14 +147,14 @@ function promote_kernel!(ctx::CompilerContext, mod::LLVM.Module, entry_f::LLVM.F end end - if ctx.blocks_per_sm != nothing + if job.blocks_per_sm != nothing append!(annotations, [MDString("minctasm"), - ConstantInt(Int32(ctx.blocks_per_sm), JuliaContext())]) + ConstantInt(Int32(job.blocks_per_sm), JuliaContext())]) end - if ctx.maxregs != nothing + if job.maxregs != nothing append!(annotations, [MDString("maxnreg"), - ConstantInt(Int32(ctx.maxregs), JuliaContext())]) + ConstantInt(Int32(job.maxregs), JuliaContext())]) end @@ -178,12 +178,12 @@ function wrapper_type(julia_t::Type, codegen_t::LLVMType)::LLVMType end # generate a kernel wrapper to fix & improve argument passing -function wrap_entry!(ctx::CompilerContext, mod::LLVM.Module, entry_f::LLVM.Function) +function wrap_entry!(job::CompilerJob, mod::LLVM.Module, entry_f::LLVM.Function) entry_ft = eltype(llvmtype(entry_f)::LLVM.PointerType)::LLVM.FunctionType - @compiler_assert return_type(entry_ft) == LLVM.VoidType(JuliaContext()) ctx + @compiler_assert return_type(entry_ft) == LLVM.VoidType(JuliaContext()) job # filter out ghost types, which don't occur in the LLVM function signatures - sig = Base.signature_type(ctx.f, ctx.tt)::Type + sig = Base.signature_type(job.f, job.tt)::Type julia_types = Type[] for dt::Type in sig.parameters if !isghosttype(dt) @@ -216,8 +216,8 @@ function wrap_entry!(ctx::CompilerContext, mod::LLVM.Module, entry_f::LLVM.Funct if codegen_t != wrapper_t # the wrapper argument doesn't match the kernel parameter type. # this only happens when codegen wants to pass a pointer. - @compiler_assert isa(codegen_t, LLVM.PointerType) ctx - @compiler_assert eltype(codegen_t) == wrapper_t ctx + @compiler_assert isa(codegen_t, LLVM.PointerType) job + @compiler_assert eltype(codegen_t) == wrapper_t job # copy the argument value to a stack slot, and reference it. ptr = alloca!(builder, wrapper_t) @@ -299,7 +299,7 @@ end # such IR is hard to clean-up, so we probably will need to have the GC lowering pass emit # lower-level intrinsics which then can be lowered to architecture-specific code. 
function lower_gc_frame!(fun::LLVM.Function) - ctx = global_ctx::CompilerContext + job = current_job::CompilerJob mod = LLVM.parent(fun) changed = false @@ -330,7 +330,7 @@ function lower_gc_frame!(fun::LLVM.Function) changed = true end - @compiler_assert isempty(uses(alloc_obj)) ctx + @compiler_assert isempty(uses(alloc_obj)) job end # we don't care about write barriers @@ -343,7 +343,7 @@ function lower_gc_frame!(fun::LLVM.Function) changed = true end - @compiler_assert isempty(uses(barrier)) ctx + @compiler_assert isempty(uses(barrier)) job end return changed @@ -357,7 +357,7 @@ end # TODO: maybe don't have Julia emit actual uses of the TLS, but use intrinsics instead, # making it easier to remove or reimplement that functionality here. function lower_ptls!(mod::LLVM.Module) - ctx = global_ctx::CompilerContext + job = current_job::CompilerJob changed = false if haskey(functions(mod), "julia.ptls_states") @@ -372,7 +372,7 @@ function lower_ptls!(mod::LLVM.Module) changed = true end - @compiler_assert isempty(uses(ptls_getter)) ctx + @compiler_assert isempty(uses(ptls_getter)) job end return changed diff --git a/src/compiler/rtlib.jl b/src/compiler/rtlib.jl index 58b6c9a5..d99dcccd 100644 --- a/src/compiler/rtlib.jl +++ b/src/compiler/rtlib.jl @@ -1,6 +1,6 @@ # compiler support for working with run-time libraries -function link_library!(ctx::CompilerContext, mod::LLVM.Module, lib::LLVM.Module) +function link_library!(job::CompilerJob, mod::LLVM.Module, lib::LLVM.Module) # linking is destructive, so copy the library lib = LLVM.Module(lib) @@ -61,12 +61,12 @@ function load_libdevice(cap) end end -function link_libdevice!(ctx::CompilerContext, mod::LLVM.Module, lib::LLVM.Module) +function link_libdevice!(job::CompilerJob, mod::LLVM.Module, lib::LLVM.Module) # override libdevice's triple and datalayout to avoid warnings triple!(lib, triple(mod)) datalayout!(lib, datalayout(mod)) - link_library!(ctx, mod, lib) + link_library!(job, mod, lib) ModulePassManager() do pm push!(metadata(mod), "nvvm-reflect-ftz", diff --git a/src/compiler/validation.jl b/src/compiler/validation.jl index 1c225f44..aa9fb682 100644 --- a/src/compiler/validation.jl +++ b/src/compiler/validation.jl @@ -1,19 +1,19 @@ # validation of properties and code -function check_method(ctx::CompilerContext) - isa(ctx.f, Core.Builtin) && throw(KernelError(ctx, "function is not a generic function")) +function check_method(job::CompilerJob) + isa(job.f, Core.Builtin) && throw(KernelError(job, "function is not a generic function")) # get the method - ms = Base.methods(ctx.f, ctx.tt) - isempty(ms) && throw(KernelError(ctx, "no method found")) - length(ms)!=1 && throw(KernelError(ctx, "no unique matching method")) + ms = Base.methods(job.f, job.tt) + isempty(ms) && throw(KernelError(job, "no method found")) + length(ms)!=1 && throw(KernelError(job, "no unique matching method")) m = first(ms) # kernels can't return values - if ctx.kernel - rt = Base.return_types(ctx.f, ctx.tt)[1] + if job.kernel + rt = Base.return_types(job.f, job.tt)[1] if rt != Nothing - throw(KernelError(ctx, "kernel returns a value of type `$rt`", + throw(KernelError(job, "kernel returns a value of type `$rt`", """Make sure your kernel function ends in `return`, `return nothing` or `nothing`. If the returned value is of type `Union{}`, your Julia code probably throws an exception. 
Inspect the code with `@device_code_warntype` for more details.""")) @@ -23,10 +23,10 @@ function check_method(ctx::CompilerContext) return end -function check_invocation(ctx::CompilerContext, entry::LLVM.Function) +function check_invocation(job::CompilerJob, entry::LLVM.Function) # make sure any non-isbits arguments are unused real_arg_i = 0 - sig = Base.signature_type(ctx.f, ctx.tt)::Type + sig = Base.signature_type(job.f, job.tt)::Type for (arg_i,dt) in enumerate(sig.parameters) isghosttype(dt) && continue real_arg_i += 1 @@ -34,7 +34,7 @@ function check_invocation(ctx::CompilerContext, entry::LLVM.Function) if !isbitstype(dt) param = parameters(entry)[real_arg_i] if !isempty(uses(param)) - throw(KernelError(ctx, "passing and using non-bitstype argument", + throw(KernelError(job, "passing and using non-bitstype argument", """Argument $arg_i to your kernel function is of type $dt. That type is not isbits, and such arguments are only allowed when they are unused by the kernel.""")) end @@ -50,7 +50,7 @@ end const IRError = Tuple{String, StackTraces.StackTrace, Any} # kind, bt, meta struct InvalidIRError <: Exception - ctx::CompilerContext + job::CompilerJob errors::Vector{IRError} end @@ -59,7 +59,7 @@ const UNKNOWN_FUNCTION = "call to an unknown function" const POINTER_FUNCTION = "call through a literal pointer" function Base.showerror(io::IO, err::InvalidIRError) - print(io, "InvalidIRError: compiling $(signature(err.ctx)) resulted in invalid LLVM IR") + print(io, "InvalidIRError: compiling $(signature(err.job)) resulted in invalid LLVM IR") for (kind, bt, meta) in err.errors print(io, "\nReason: unsupported $kind") if meta != nothing @@ -72,28 +72,28 @@ function Base.showerror(io::IO, err::InvalidIRError) return end -function check_ir(ctx, args...) - errors = check_ir!(ctx, IRError[], args...) +function check_ir(job, args...) + errors = check_ir!(job, IRError[], args...) 
unique!(errors) if !isempty(errors) - throw(InvalidIRError(ctx, errors)) + throw(InvalidIRError(job, errors)) end return end -function check_ir!(ctx, errors::Vector{IRError}, mod::LLVM.Module) +function check_ir!(job, errors::Vector{IRError}, mod::LLVM.Module) for f in functions(mod) - check_ir!(ctx, errors, f) + check_ir!(job, errors, f) end return errors end -function check_ir!(ctx, errors::Vector{IRError}, f::LLVM.Function) +function check_ir!(job, errors::Vector{IRError}, f::LLVM.Function) for bb in blocks(f), inst in instructions(bb) if isa(inst, LLVM.CallInst) - check_ir!(ctx, errors, inst) + check_ir!(job, errors, inst) end end @@ -111,7 +111,7 @@ const special_fns = ( const libjulia = Ref{Ptr{Cvoid}}(C_NULL) -function check_ir!(ctx, errors::Vector{IRError}, inst::LLVM.CallInst) +function check_ir!(job, errors::Vector{IRError}, inst::LLVM.CallInst) dest = called_value(inst) if isa(dest, LLVM.Function) fn = LLVM.name(dest) @@ -143,7 +143,7 @@ function check_ir!(ctx, errors::Vector{IRError}, inst::LLVM.CallInst) if occursin("inttoptr", string(dest)) # extract the literal pointer ptr_arg = first(operands(dest)) - @compiler_assert isa(ptr_arg, ConstantInt) ctx + @compiler_assert isa(ptr_arg, ConstantInt) job ptr_val = convert(Int, ptr_arg) ptr = Ptr{Cvoid}(ptr_val) @@ -151,7 +151,7 @@ function check_ir!(ctx, errors::Vector{IRError}, inst::LLVM.CallInst) bt = backtrace(inst) frames = ccall(:jl_lookup_code_address, Any, (Ptr{Cvoid}, Cint,), ptr, 0) if length(frames) >= 1 - @compiler_assert length(frames) == 1 ctx frames=frames + @compiler_assert length(frames) == 1 job frames=frames fn, file, line, linfo, fromC, inlined, ip = last(frames) push!(errors, (POINTER_FUNCTION, bt, fn)) else diff --git a/src/reflection.jl b/src/reflection.jl index 1b23388f..d92ae900 100644 --- a/src/reflection.jl +++ b/src/reflection.jl @@ -31,13 +31,13 @@ function code_llvm(io::IO, @nospecialize(func::Core.Function), @nospecialize(typ dump_module::Bool=false, strip_ir_metadata::Bool=true, kernel::Bool=false, kwargs...) tt = Base.to_tuple_type(types) - ctx = CompilerContext(func, tt, cap, kernel; kwargs...) - code_llvm(io, ctx; optimize=optimize, dump_module=dump_module, + job = CompilerJob(func, tt, cap, kernel; kwargs...) + code_llvm(io, job; optimize=optimize, dump_module=dump_module, strip_ir_metadata=strip_ir_metadata) end -function code_llvm(io::IO, ctx::CompilerContext; optimize::Bool=true, +function code_llvm(io::IO, job::CompilerJob; optimize::Bool=true, dump_module::Bool=false, strip_ir_metadata::Bool=true) - ir, entry = compile(:llvm, ctx; hooks=false, optimize=optimize, strip=strip_ir_metadata) + ir, entry = compile(:llvm, job; hooks=false, optimize=optimize, strip=strip_ir_metadata) if dump_module show(io, ir) else @@ -63,11 +63,11 @@ function code_ptx(io::IO, @nospecialize(func::Core.Function), @nospecialize(type cap::VersionNumber=current_capability(), kernel::Bool=false, strip_ir_metadata::Bool=true, kwargs...) tt = Base.to_tuple_type(types) - ctx = CompilerContext(func, tt, cap, kernel; kwargs...) - code_ptx(io, ctx; strip_ir_metadata=strip_ir_metadata) + job = CompilerJob(func, tt, cap, kernel; kwargs...) 
+ code_ptx(io, job; strip_ir_metadata=strip_ir_metadata) end -function code_ptx(io::IO, ctx::CompilerContext; strip_ir_metadata::Bool=true) - asm, _ = compile(:ptx, ctx; hooks=false, strip=strip_ir_metadata) +function code_ptx(io::IO, job::CompilerJob; strip_ir_metadata::Bool=true) + asm, _ = compile(:ptx, job; hooks=false, strip=strip_ir_metadata) print(io, asm) end code_ptx(@nospecialize(func), @nospecialize(types); kwargs...) = @@ -87,21 +87,21 @@ See also: [`@device_code_sass`](@ref) function code_sass(io::IO, @nospecialize(func::Core.Function), @nospecialize(types); cap::VersionNumber=current_capability(), kernel::Bool=true, kwargs...) tt = Base.to_tuple_type(types) - ctx = CompilerContext(func, tt, cap, kernel; kwargs...) - code_sass(io, ctx) + job = CompilerJob(func, tt, cap, kernel; kwargs...) + code_sass(io, job) end -function code_sass(io::IO, ctx::CompilerContext) - if !ctx.kernel +function code_sass(io::IO, job::CompilerJob) + if !job.kernel error("Can only generate SASS code for kernel functions") end if ptxas === nothing || nvdisasm === nothing error("Your CUDA installation does not provide ptxas or nvdisasm, both of which are required for code_sass") end - ptx, _ = compile(:ptx, ctx; hooks=false) + ptx, _ = compile(:ptx, job; hooks=false) fn = tempname() - gpu = "sm_$(ctx.cap.major)$(ctx.cap.minor)" + gpu = "sm_$(job.cap.major)$(job.cap.minor)" # NOTE: this might not match what is being executed, due to the PTX->SASS conversion # by the driver possibly not matching what `ptxas` (part of the toolkit) does. # TODO: see how `nvvp` extracts SASS code when doing PC sampling, and copy that. @@ -142,9 +142,9 @@ function emit_hooked_compilation(inner_hook, ex...) empty!(CUDAnative.compilecache) local kernels = 0 - function outer_hook(ctx) + function outer_hook(job) kernels += 1 - $inner_hook(ctx; $(map(esc, user_kwargs)...)) + $inner_hook(job; $(map(esc, user_kwargs)...)) end if CUDAnative.compile_hook[] != nothing @@ -184,8 +184,8 @@ See also: [`InteractiveUtils.@code_lowered`](@ref) macro device_code_lowered(ex...) quote buf = Any[] - function hook(ctx::CompilerContext) - append!(buf, code_lowered(ctx.f, ctx.tt)) + function hook(job::CompilerJob) + append!(buf, code_lowered(job.f, job.tt)) end $(emit_hooked_compilation(:hook, ex...)) buf @@ -203,8 +203,8 @@ See also: [`InteractiveUtils.@code_typed`](@ref) macro device_code_typed(ex...) quote buf = Any[] - function hook(ctx::CompilerContext) - append!(buf, code_typed(ctx.f, ctx.tt)) + function hook(job::CompilerJob) + append!(buf, code_typed(job.f, job.tt)) end $(emit_hooked_compilation(:hook, ex...)) buf @@ -220,8 +220,8 @@ Evaluates the expression `ex` and prints the result of See also: [`InteractiveUtils.@code_warntype`](@ref) """ macro device_code_warntype(ex...) - function hook(ctx::CompilerContext; io::IO=stdout, kwargs...) - code_warntype(io, ctx.f, ctx.tt; kwargs...) + function hook(job::CompilerJob; io::IO=stdout, kwargs...) + code_warntype(io, job.f, job.tt; kwargs...) end emit_hooked_compilation(hook, ex...) end @@ -236,7 +236,7 @@ to `io` for every compiled CUDA kernel. For other supported keywords, see See also: [`InteractiveUtils.@code_llvm`](@ref) """ macro device_code_llvm(ex...) - hook(ctx::CompilerContext; io::IO=stdout, kwargs...) = code_llvm(io, ctx; kwargs...) + hook(job::CompilerJob; io::IO=stdout, kwargs...) = code_llvm(io, job; kwargs...) emit_hooked_compilation(hook, ex...) end @@ -248,7 +248,7 @@ for every compiled CUDA kernel. For other supported keywords, see [`CUDAnative.code_ptx`](@ref). 
""" macro device_code_ptx(ex...) - hook(ctx::CompilerContext; io::IO=stdout, kwargs...) = code_ptx(io, ctx; kwargs...) + hook(job::CompilerJob; io::IO=stdout, kwargs...) = code_ptx(io, job; kwargs...) emit_hooked_compilation(hook, ex...) end @@ -260,7 +260,7 @@ Evaluates the expression `ex` and prints the result of [`CUDAnative.code_sass`]( [`CUDAnative.code_sass`](@ref). """ macro device_code_sass(ex...) - hook(ctx::CompilerContext; io::IO=stdout, kwargs...) = code_sass(io, ctx; kwargs...) + hook(job::CompilerJob; io::IO=stdout, kwargs...) = code_sass(io, job; kwargs...) emit_hooked_compilation(hook, ex...) end @@ -272,34 +272,34 @@ Evaluates the expression `ex` and dumps all intermediate forms of code to the di """ macro device_code(ex...) only(xs) = (@assert length(xs) == 1; first(xs)) - function hook(ctx::CompilerContext; dir::AbstractString) - fn = "$(typeof(ctx.f).name.mt.name)_$(globalUnique+1)" + function hook(job::CompilerJob; dir::AbstractString) + fn = "$(typeof(job.f).name.mt.name)_$(globalUnique+1)" mkpath(dir) open(joinpath(dir, "$fn.lowered.jl"), "w") do io - code = only(code_lowered(ctx.f, ctx.tt)) + code = only(code_lowered(job.f, job.tt)) println(io, code) end open(joinpath(dir, "$fn.typed.jl"), "w") do io - code = only(code_typed(ctx.f, ctx.tt)) + code = only(code_typed(job.f, job.tt)) println(io, code) end open(joinpath(dir, "$fn.unopt.ll"), "w") do io - code_llvm(io, ctx; dump_module=true, strip_ir_metadata=false, optimize=false) + code_llvm(io, job; dump_module=true, strip_ir_metadata=false, optimize=false) end open(joinpath(dir, "$fn.opt.ll"), "w") do io - code_llvm(io, ctx; dump_module=true, strip_ir_metadata=false) + code_llvm(io, job; dump_module=true, strip_ir_metadata=false) end open(joinpath(dir, "$fn.ptx"), "w") do io - code_ptx(io, ctx) + code_ptx(io, job) end open(joinpath(dir, "$fn.sass"), "w") do io - code_sass(io, ctx) + code_sass(io, job) end end emit_hooked_compilation(hook, ex...) From 9dabf73bd78ad4e022560422c4e9a253e42346da Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Thu, 21 Mar 2019 13:50:04 +0100 Subject: [PATCH 23/34] Bugfix. --- src/execution.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/execution.jl b/src/execution.jl index 428f586a..afe3ca18 100644 --- a/src/execution.jl +++ b/src/execution.jl @@ -425,7 +425,7 @@ end append!(ex.args, (quote #GC.@preserve $(converted_args...) begin - launch(f, blocks, threads, shmem, stream, ($(arg_ptrs...),)) + launch(f, blocks, threads, shmem, stream, $(arg_ptrs...)) #end end).args) From e2fefdaadee486bad676751491eee883db6c07c9 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Thu, 21 Mar 2019 14:31:04 +0100 Subject: [PATCH 24/34] Iterative compilation to deal with nested recursion. 
--- Project.toml | 1 + src/CUDAnative.jl | 1 + src/compiler/driver.jl | 116 ++++++++++++++++++++++++----------------- 3 files changed, 70 insertions(+), 48 deletions(-) diff --git a/Project.toml b/Project.toml index a62a301c..fd44b603 100644 --- a/Project.toml +++ b/Project.toml @@ -5,6 +5,7 @@ uuid = "be33ccc6-a3ff-5ff2-a52e-74243cff1e17" Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" CUDAapi = "3895d2a7-ec45-59b8-82bb-cfc6a382f9b3" CUDAdrv = "c5f51814-7f29-56b8-a69c-e4d8f6be1fde" +DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240" LLVM = "929cbde3-209d-540e-8aea-75f648917ca0" Libdl = "8f399da3-3557-5675-b5ff-fb832c97cbdb" diff --git a/src/CUDAnative.jl b/src/CUDAnative.jl index 018729ae..fab16f49 100644 --- a/src/CUDAnative.jl +++ b/src/CUDAnative.jl @@ -6,6 +6,7 @@ using LLVM using LLVM.Interop using Adapt +using DataStructures using Pkg using Libdl diff --git a/src/compiler/driver.jl b/src/compiler/driver.jl index 9687b050..d6e6544f 100644 --- a/src/compiler/driver.jl +++ b/src/compiler/driver.jl @@ -62,7 +62,7 @@ function compile(to::Symbol, job::CompilerJob; ## LLVM IR - ir, entry = irgen(job, linfo, world) + ir, kernel = irgen(job, linfo, world) need_library(lib) = any(f -> isdeclaration(f) && intrinsic_id(f) == 0 && @@ -76,7 +76,7 @@ function compile(to::Symbol, job::CompilerJob; # optimize the IR if optimize - entry = optimize!(job, ir, entry) + kernel = optimize!(job, ir, kernel) end runtime = load_runtime(job.cap) @@ -90,68 +90,88 @@ function compile(to::Symbol, job::CompilerJob; strip_debuginfo!(ir) end + kernel_fn = LLVM.name(kernel) + kernel_ft = eltype(llvmtype(kernel)) + + to == :llvm && return ir, kernel + ## dynamic parallelism + kernels = OrderedDict{CompilerJob, String}(job => kernel_fn) + if haskey(functions(ir), "cudanativeCompileKernel") - f = functions(ir)["cudanativeCompileKernel"] - - # find dynamic kernel invocations - # TODO: recover this information earlier, from the Julia IR - worklist = Dict{Tuple{Core.Function,Type}, Vector{LLVM.CallInst}}() - for use in uses(f) - # decode the call - call = user(use)::LLVM.CallInst - ops = collect(operands(call))[1:2] - ## addrspacecast - ops = LLVM.Value[first(operands(val)) for val in ops] - ## inttoptr - ops = ConstantInt[first(operands(val)) for val in ops] - ## integer constants - ops = convert.(Int, ops) - ## actual pointer values - ops = Ptr{Any}.(ops) - - dyn_f, dyn_tt = unsafe_pointer_to_objref.(ops) - calls = get!(worklist, (dyn_f, dyn_tt), LLVM.CallInst[]) - push!(calls, call) - end + dyn_maker = functions(ir)["cudanativeCompileKernel"] + + # iterative compilation (non-recursive) + changed = true + while changed + changed = false + + # find dynamic kernel invocations + # TODO: recover this information earlier, from the Julia IR + worklist = MultiDict{CompilerJob, LLVM.CallInst}() + for use in uses(dyn_maker) + # decode the call + call = user(use)::LLVM.CallInst + ops = collect(operands(call))[1:2] + ## addrspacecast + ops = LLVM.Value[first(operands(val)) for val in ops] + ## inttoptr + ops = ConstantInt[first(operands(val)) for val in ops] + ## integer constants + ops = convert.(Int, ops) + ## actual pointer values + ops = Ptr{Any}.(ops) + + dyn_f, dyn_tt = unsafe_pointer_to_objref.(ops) + dyn_job = CompilerJob(dyn_f, dyn_tt, job.cap, #=kernel=# true) + push!(worklist, dyn_job => call) + end - # compile and link - for (dyn_f, dyn_tt) in keys(worklist) - dyn_ctx = CompilerJob(dyn_f, dyn_tt, job.cap, true) - dyn_ir, dyn_entry = - compile(:llvm, dyn_ctx; 
hooks=false, optimize=optimize, strip=strip) - - dyn_fn = LLVM.name(dyn_entry) - link!(ir, dyn_ir) - dyn_ir = nothing - dyn_entry = functions(ir)[dyn_fn] - - # insert a call everywhere the kernel is used - for call in worklist[(dyn_f,dyn_tt)] - replace_uses!(call, dyn_entry) - unsafe_delete!(LLVM.parent(call), call) + # compile and link + for dyn_job in keys(worklist) + # cached compilation + dyn_kernel_fn = get!(kernels, dyn_job) do + dyn_ir, dyn_kernel = compile(:llvm, dyn_job; hooks=false, + optimize=optimize, strip=strip) + dyn_kernel_fn = LLVM.name(dyn_kernel) + dyn_kernel_ft = eltype(llvmtype(dyn_kernel)) + link!(ir, dyn_ir) + changed = true + dyn_kernel_fn + end + dyn_kernel = functions(ir)[dyn_kernel_fn] + + # insert a pointer to the function everywhere the kernel is used + T_ptr = convert(LLVMType, Ptr{Cvoid}) + for call in worklist[dyn_job] + Builder(JuliaContext()) do builder + position!(builder, call) + fptr = ptrtoint!(builder, dyn_kernel, T_ptr) + replace_uses!(call, fptr) + end + unsafe_delete!(LLVM.parent(call), call) + end end end - @compiler_assert isempty(uses(f)) job - unsafe_delete!(ir, f) + # all dynamic launches should have been resolved + @compiler_assert isempty(uses(dyn_maker)) job + unsafe_delete!(ir, dyn_maker) end - to == :llvm && return ir, entry - ## PTX machine code prepare_execution!(job, ir) - check_invocation(job, entry) + check_invocation(job, kernel) check_ir(job, ir) - asm = mcgen(job, ir, entry) + asm = mcgen(job, ir, kernel) - to == :ptx && return asm, LLVM.name(entry) + to == :ptx && return asm, kernel_fn ## CUDA objects @@ -167,11 +187,11 @@ function compile(to::Symbol, job::CompilerJob; # link the CUDA device library linker = CUDAdrv.CuLink(jit_options) CUDAdrv.add_file!(linker, libcudadevrt, CUDAdrv.LIBRARY) - CUDAdrv.add_data!(linker, LLVM.name(entry), asm) + CUDAdrv.add_data!(linker, kernel_fn, asm) image = CUDAdrv.complete(linker) cuda_mod = CuModule(image, jit_options) - cuda_fun = CuFunction(cuda_mod, LLVM.name(entry)) + cuda_fun = CuFunction(cuda_mod, kernel_fn) to == :cuda && return cuda_fun, cuda_mod From ee21fbc90693f9010d2cd32fe15bb26ef70644c3 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Thu, 21 Mar 2019 15:39:19 +0100 Subject: [PATCH 25/34] Call into Julia's GC pass for IR cleanup purposes. Without it, LLVM+asserts will complain. --- src/compiler/optim.jl | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/compiler/optim.jl b/src/compiler/optim.jl index f4f3302b..eae9ae3f 100644 --- a/src/compiler/optim.jl +++ b/src/compiler/optim.jl @@ -51,9 +51,18 @@ function optimize!(job::CompilerJob, mod::LLVM.Module, entry::LLVM.Function) ModulePassManager() do pm initialize!(pm) + + # lower intrinsics add!(pm, FunctionPass("LowerGCFrame", lower_gc_frame!)) aggressive_dce!(pm) # remove dead uses of ptls add!(pm, ModulePass("LowerPTLS", lower_ptls!)) + + # the Julia GC lowering pass also has some clean-up that is required + function LLVMAddLateLowerGCFramePass(PM::LLVM.API.LLVMPassManagerRef) + LLVM.@apicall(:LLVMExtraAddLateLowerGCFramePass,Cvoid,(LLVM.API.LLVMPassManagerRef,), PM) + end + LLVMAddLateLowerGCFramePass(LLVM.ref(pm)) + run!(pm, mod) end end From 82e6afc347ff7cddc2b76bdd8afe0b8d968d1f35 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Thu, 21 Mar 2019 15:30:06 +0100 Subject: [PATCH 26/34] Document 265 issue. 
--- src/execution.jl | 1 + 1 file changed, 1 insertion(+) diff --git a/src/execution.jl b/src/execution.jl index afe3ca18..bfbcb932 100644 --- a/src/execution.jl +++ b/src/execution.jl @@ -453,6 +453,7 @@ No keyword arguments are supported. else # we can't compile here, so drop a marker which will get picked up during compilation quote + # TODO: add an edge to this method instance to support method redefinitions fptr = ccall("extern cudanativeCompileKernel", llvmcall, Ptr{Cvoid}, (Any, Any), f, tt) DeviceKernel{f,tt}(fptr) From 1aba961ba34d70ad0d0bdd30ae3158fac6668975 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Thu, 21 Mar 2019 15:35:30 +0100 Subject: [PATCH 27/34] Add tests. --- test/device/execution.jl | 92 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 92 insertions(+) diff --git a/test/device/execution.jl b/test/device/execution.jl index 3d37c040..86a7d8e5 100644 --- a/test/device/execution.jl +++ b/test/device/execution.jl @@ -807,4 +807,96 @@ end ############################################################################################ +@testset "dynamic parallelism" begin + +@testset "basic usage" begin + function hello() + @cuprintf("Hello, ") + @cuda dynamic=true world() + return + end + + @eval function world() + @cuprintf("World!") + return + end + + _, out = @grab_output begin + @cuda hello() + synchronize() + end + @test out == "Hello, World!" +end + +@testset "argument passing" begin + function kernel(a, b, c) + @cuprintf("%ld %ld %ld", Int64(a), Int64(b), Int64(c)) + return + end + + for args in ((Int16(1), Int32(2), Int64(3)), # padding + (Int32(1), Int32(2), Int32(3)), # no padding, equal size + (Int64(1), Int32(2), Int16(3)), # no padding, inequal size + (Int16(1), Int64(2), Int32(3))) # mixed + _, out = @grab_output begin + @cuda kernel(args...) + synchronize() + end + @test out == "1 2 3" + end +end + +@testset "self-recursion" begin + @eval function kernel(x::Bool) + if x + @cuprintf("recurse ") + @cuda dynamic=true kernel(false) + else + @cuprintf("stop") + end + return + end + + _, out = @grab_output begin + @cuda kernel(true) + synchronize() + end + @test out == "recurse stop" +end + +@testset "deep recursion" begin + @eval function kernel_a(x::Bool) + @cuprintf("a ") + @cuda dynamic=true kernel_b(x) + return + end + + @eval function kernel_b(x::Bool) + @cuprintf("b ") + @cuda dynamic=true kernel_c(x) + return + end + + @eval function kernel_c(x::Bool) + @cuprintf("c ") + if x + @cuprintf("recurse ") + @cuda dynamic=true kernel_a(false) + else + @cuprintf("stop") + end + return + end + + _, out = @grab_output begin + @cuda kernel_a(true) + synchronize() + end + @test out == "a b c recurse a b c stop" +end + +end + +############################################################################################ + end From b685fc8866aa37571c83936e807038399c279de8 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Thu, 21 Mar 2019 16:23:03 +0100 Subject: [PATCH 28/34] Fixes to runtime library generation. 
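The device runtime library is itself built by running the compiler over a set of Julia functions, so those inner invocations must not try to link libdevice or the very runtime that is being generated. The new `libraries` keyword makes that explicit: regular compilation links the auxiliary bitcode libraries, while runtime generation passes `libraries=false`. A rough sketch of the shape of the fix, using made-up names rather than the real driver:

    # Hypothetical sketch only: the inner calls disable library linking,
    # breaking the chicken-and-egg dependency on the runtime being built.
    function compile_sketch(entry::String; libraries::Bool=true)
        ir = ["ir($entry)"]
        if libraries
            for rtfn in ("gc_pool_alloc", "report_exception")       # assumed runtime entry points
                append!(ir, compile_sketch(rtfn; libraries=false))  # no runtime while building it
            end
        end
        return ir
    end

    compile_sketch("kernel")    # ["ir(kernel)", "ir(gc_pool_alloc)", "ir(report_exception)"]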
--- src/compiler/driver.jl | 26 ++++++++++++++++---------- src/compiler/rtlib.jl | 3 ++- 2 files changed, 18 insertions(+), 11 deletions(-) diff --git a/src/compiler/driver.jl b/src/compiler/driver.jl index d6e6544f..cff5e345 100644 --- a/src/compiler/driver.jl +++ b/src/compiler/driver.jl @@ -14,19 +14,22 @@ set, specialized code generation and optimization for kernel functions is enable The following keyword arguments are supported: - `hooks`: enable compiler hooks that drive reflection functions (default: true) +- `libraries`: link auxiliary bitcode libraries that may be required (default: true) - `optimize`: optimize the code (default: true) - `strip`: strip non-functional metadata and debug information (default: false) Other keyword arguments can be found in the documentation of [`cufunction`](@ref). """ compile(to::Symbol, cap::VersionNumber, @nospecialize(f::Core.Function), @nospecialize(tt), - kernel::Bool=true; hooks::Bool=true, optimize::Bool=true, strip::Bool=false, + kernel::Bool=true; hooks::Bool=true, libraries::Bool=true, + optimize::Bool=true, strip::Bool=false, kwargs...) = compile(to, CompilerJob(f, tt, cap, kernel; kwargs...); - hooks=hooks, optimize=optimize, strip=strip) + hooks=hooks, libraries=libraries, optimize=optimize, strip=strip) function compile(to::Symbol, job::CompilerJob; - hooks::Bool=true, optimize::Bool=true, strip::Bool=false) + hooks::Bool=true, libraries::Bool=true, + optimize::Bool=true, strip::Bool=false) @debug "(Re)compiling function" job if hooks && compile_hook[] != nothing @@ -69,19 +72,22 @@ function compile(to::Symbol, job::CompilerJob; haskey(functions(lib), LLVM.name(f)), functions(ir)) - libdevice = load_libdevice(job.cap) - if need_library(libdevice) - link_libdevice!(job, ir, libdevice) + if libraries + libdevice = load_libdevice(job.cap) + if need_library(libdevice) + link_libdevice!(job, ir, libdevice) + end end - # optimize the IR if optimize kernel = optimize!(job, ir, kernel) end - runtime = load_runtime(job.cap) - if need_library(runtime) - link_library!(job, ir, runtime) + if libraries + runtime = load_runtime(job.cap) + if need_library(runtime) + link_library!(job, ir, runtime) + end end verify(ir) diff --git a/src/compiler/rtlib.jl b/src/compiler/rtlib.jl index d99dcccd..5a9f4556 100644 --- a/src/compiler/rtlib.jl +++ b/src/compiler/rtlib.jl @@ -124,7 +124,8 @@ end function emit_function!(mod, cap, f, types, name) tt = Base.to_tuple_type(types) - new_mod, entry = compile(:llvm, cap, f, tt, #=kernel=# false; hooks=false) + new_mod, entry = compile(:llvm, cap, f, tt, #=kernel=# false; + hooks=false, libraries=false) LLVM.name!(entry, name) link!(mod, new_mod) end From 91ed41d942ad7f17670ecb52a880c363b0b4d076 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Fri, 22 Mar 2019 09:40:05 +0100 Subject: [PATCH 29/34] Simplify delayed compilation markers. 
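Rather than encoding raw object pointers for the target function and type tuple into the marker call (and decoding them via `inttoptr`/`unsafe_pointer_to_objref` in the driver), each `(f, tt)` pair is now pushed into a host-side table and only its integer index is embedded in the IR; the driver recovers the pair with a plain table lookup. A stripped-down, CUDA-free sketch of that registry pattern (names are illustrative, not the actual API):

    const _registry = Tuple{Function,Type}[]

    # The generator runs once per (f, tt) specialization, records the pair,
    # and emits only the integer id into the generated code.
    @generated function _register(::Val{f}, ::Val{tt}) where {f,tt}
        push!(_registry, (f, tt))
        id = length(_registry)
        return :($id)
    end

    _register(Val(sin), Val(Tuple{Float64}))   # returns 1; _registry[1] == (sin, Tuple{Float64})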
--- src/compiler/driver.jl | 15 ++++----------- src/execution.jl | 15 +++++++++++---- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/src/compiler/driver.jl b/src/compiler/driver.jl index cff5e345..3ca05945 100644 --- a/src/compiler/driver.jl +++ b/src/compiler/driver.jl @@ -120,17 +120,10 @@ function compile(to::Symbol, job::CompilerJob; for use in uses(dyn_maker) # decode the call call = user(use)::LLVM.CallInst - ops = collect(operands(call))[1:2] - ## addrspacecast - ops = LLVM.Value[first(operands(val)) for val in ops] - ## inttoptr - ops = ConstantInt[first(operands(val)) for val in ops] - ## integer constants - ops = convert.(Int, ops) - ## actual pointer values - ops = Ptr{Any}.(ops) - - dyn_f, dyn_tt = unsafe_pointer_to_objref.(ops) + id = convert(Int, first(operands(call))) + + global delayed_cufunctions + dyn_f, dyn_tt = delayed_cufunctions[id] dyn_job = CompilerJob(dyn_f, dyn_tt, job.cap, #=kernel=# true) push!(worklist, dyn_job => call) end diff --git a/src/execution.jl b/src/execution.jl index bfbcb932..7c6f861b 100644 --- a/src/execution.jl +++ b/src/execution.jl @@ -443,7 +443,11 @@ a callable kernel object. Device-side equivalent of [`CUDAnative.cufunction`](@r No keyword arguments are supported. """ -@generated function dynamic_cufunction(f::Core.Function, tt::Type=Tuple{}) +@inline dynamic_cufunction(f::Core.Function, tt::Type=Tuple{}) = + delayed_cufunction(Val(f), Val(tt)) + +const delayed_cufunctions = Vector{Tuple{Core.Function,Type}}() +@generated function delayed_cufunction(::Val{f}, ::Val{tt}) where {f,tt} if sizeof(f) > 0 Core.println(Core.stderr, "ERROR: @cuda dynamic parallelism does not support closures") quote @@ -451,11 +455,14 @@ No keyword arguments are supported. DeviceKernel{f,tt}(C_NULL) end else - # we can't compile here, so drop a marker which will get picked up during compilation + global delayed_cufunctions + push!(delayed_cufunctions, (f,tt)) + id = length(delayed_cufunctions) + + # drop a marker which will get picked up during compilation quote # TODO: add an edge to this method instance to support method redefinitions - fptr = ccall("extern cudanativeCompileKernel", llvmcall, Ptr{Cvoid}, - (Any, Any), f, tt) + fptr = ccall("extern cudanativeCompileKernel", llvmcall, Ptr{Cvoid}, (Int,), $id) DeviceKernel{f,tt}(fptr) end end From d4d6aca29cf118a9c24ac8e97b82abdb998a7d0a Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Fri, 22 Mar 2019 10:01:06 +0100 Subject: [PATCH 30/34] Allow and test anonymous functions and closures. --- src/execution.jl | 23 ++++++++--------------- test/device/execution.jl | 31 +++++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+), 15 deletions(-) diff --git a/src/execution.jl b/src/execution.jl index 7c6f861b..12011364 100644 --- a/src/execution.jl +++ b/src/execution.jl @@ -448,23 +448,16 @@ No keyword arguments are supported. 
const delayed_cufunctions = Vector{Tuple{Core.Function,Type}}() @generated function delayed_cufunction(::Val{f}, ::Val{tt}) where {f,tt} - if sizeof(f) > 0 - Core.println(Core.stderr, "ERROR: @cuda dynamic parallelism does not support closures") - quote - trap() - DeviceKernel{f,tt}(C_NULL) - end - else - global delayed_cufunctions - push!(delayed_cufunctions, (f,tt)) - id = length(delayed_cufunctions) + global delayed_cufunctions + push!(delayed_cufunctions, (f,tt)) + id = length(delayed_cufunctions) + quote # drop a marker which will get picked up during compilation - quote - # TODO: add an edge to this method instance to support method redefinitions - fptr = ccall("extern cudanativeCompileKernel", llvmcall, Ptr{Cvoid}, (Int,), $id) - DeviceKernel{f,tt}(fptr) - end + # TODO: add an edge to this method instance to support method redefinitions + fptr = ccall("extern cudanativeCompileKernel", llvmcall, Ptr{Cvoid}, (Int,), $id) + + DeviceKernel{f,tt}(fptr) end end diff --git a/test/device/execution.jl b/test/device/execution.jl index 86a7d8e5..edc8c107 100644 --- a/test/device/execution.jl +++ b/test/device/execution.jl @@ -828,6 +828,37 @@ end @test out == "Hello, World!" end +@testset "anonymous functions" begin + function hello() + @cuprintf("Hello, ") + world = () -> (@cuprintf("World!"); nothing) + @cuda dynamic=true world() + return + end + + _, out = @grab_output begin + @cuda hello() + synchronize() + end + @test out == "Hello, World!" +end + +@testset "closures" begin + function hello() + x = 1 + @cuprintf("Hello, ") + world = () -> (@cuprintf("World %ld!", x); nothing) + @cuda dynamic=true world() + return + end + + _, out = @grab_output begin + @cuda hello() + synchronize() + end + @test out == "Hello, World 1!" +end + @testset "argument passing" begin function kernel(a, b, c) @cuprintf("%ld %ld %ld", Int64(a), Int64(b), Int64(c)) From a2a07a6e1880d63d3262eff1075204367499c478 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Fri, 22 Mar 2019 10:39:42 +0100 Subject: [PATCH 31/34] Add wrappers for older versions of Julia. 
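Julia 1.2.0-DEV.512 and newer can call named external device functions directly with `ccall("extern ...", llvmcall, ...)`; older versions need a hand-written `Base.llvmcall` with an explicit `(declarations, body)` IR tuple, so each device-runtime entry point gets a version-gated wrapper. A tiny host-runnable sketch of the gating pattern — the IR here is deliberately self-contained, whereas the real fallbacks declare `cudaLaunchDeviceV2`, `cudaGetParameterBufferV2` and `cudaDeviceSynchronize`:

    if VERSION >= v"1.2.0-DEV.512"
        # stands in for the `ccall("extern ...", llvmcall, ...)` form on newer Julia
        @inline add_one(x::Int32) = x + Int32(1)
    else
        # older Julia: hand-written inline IR via Base.llvmcall
        @inline add_one(x::Int32) = Base.llvmcall(
            """
            %r = add i32 %0, 1
            ret i32 %r""", Int32, Tuple{Int32}, x)
    end

    add_one(Int32(41))    # 42 on either code path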
--- src/compiler/optim.jl | 9 +++-- src/device/cuda/libcudadevrt.jl | 72 +++++++++++++++++++++++++++++---- src/execution.jl | 17 ++++++-- 3 files changed, 84 insertions(+), 14 deletions(-) diff --git a/src/compiler/optim.jl b/src/compiler/optim.jl index eae9ae3f..887078fb 100644 --- a/src/compiler/optim.jl +++ b/src/compiler/optim.jl @@ -58,10 +58,13 @@ function optimize!(job::CompilerJob, mod::LLVM.Module, entry::LLVM.Function) add!(pm, ModulePass("LowerPTLS", lower_ptls!)) # the Julia GC lowering pass also has some clean-up that is required - function LLVMAddLateLowerGCFramePass(PM::LLVM.API.LLVMPassManagerRef) - LLVM.@apicall(:LLVMExtraAddLateLowerGCFramePass,Cvoid,(LLVM.API.LLVMPassManagerRef,), PM) + if VERSION >= v"1.2.0-DEV.520" + # TODO: move this to LLVM.jl + function LLVMAddLateLowerGCFramePass(PM::LLVM.API.LLVMPassManagerRef) + LLVM.@apicall(:LLVMExtraAddLateLowerGCFramePass,Cvoid,(LLVM.API.LLVMPassManagerRef,), PM) + end + LLVMAddLateLowerGCFramePass(LLVM.ref(pm)) end - LLVMAddLateLowerGCFramePass(LLVM.ref(pm)) run!(pm, mod) end diff --git a/src/device/cuda/libcudadevrt.jl b/src/device/cuda/libcudadevrt.jl index 59f7c969..03ca74c8 100644 --- a/src/device/cuda/libcudadevrt.jl +++ b/src/device/cuda/libcudadevrt.jl @@ -9,6 +9,25 @@ import CUDAdrv: CuDim3, CuStream_t const cudaError_t = Cint const cudaStream_t = CUDAdrv.CuStream_t +if VERSION >= v"1.2.0-DEV.512" + @inline cudaLaunchDevice(buf::Ptr{Cvoid}, stream::CuStream) = + ccall("extern cudaLaunchDeviceV2", llvmcall, cudaError_t, + (Ptr{Cvoid}, cudaStream_t), + buf, stream) +else + import Base.Sys: WORD_SIZE + # declare i32 @cudaLaunchDeviceV2(i8*, %struct.CUstream_st*) + @eval @inline cudaLaunchDevice(buf::Ptr{Cvoid}, stream::CuStream) = + Base.llvmcall( + ( "declare i32 @cudaLaunchDeviceV2(i8*, i8*)", + $"%buf = inttoptr i$WORD_SIZE %0 to i8* + %stream = inttoptr i$WORD_SIZE %1 to i8* + %rv = call i32 @cudaLaunchDeviceV2(i8* %buf, i8* %stream) + ret i32 %rv"), cudaError_t, + Tuple{Ptr{Cvoid}, cudaStream_t}, + buf, Base.unsafe_convert(cudaStream_t, stream)) +end + # device-side counterpart of CUDAdrv.launch @inline function launch(f::Ptr{Cvoid}, blocks::CuDim, threads::CuDim, shmem::Int, stream::CuStream, @@ -17,21 +36,48 @@ const cudaStream_t = CUDAdrv.CuStream_t threads = CuDim3(threads) buf = parameter_buffer(f, blocks, threads, shmem, args...) 
- - ccall("extern cudaLaunchDeviceV2", llvmcall, cudaError_t, - (Ptr{Cvoid}, cudaStream_t), - buf, stream) + cudaLaunchDevice(buf, stream) return end +if VERSION >= v"1.2.0-DEV.512" + @inline cudaGetParameterBuffer(f::Ptr{Cvoid}, blocks::CuDim3, threads::CuDim3, shmem::Integer) = + ccall("extern cudaGetParameterBufferV2", llvmcall, Ptr{Cvoid}, + (Ptr{Cvoid}, CuDim3, CuDim3, Cuint), + f, blocks, threads, shmem) +else + @inline cudaGetParameterBuffer(f::Ptr{Cvoid}, blocks::CuDim3, threads::CuDim3, shmem::Integer) = + cudaGetParameterBuffer(f, + blocks.x, blocks.y, blocks.z, + threads.x, threads.y, threads.z, + convert(Cuint, shmem)) + # declare i8* @cudaGetParameterBufferV2(i8*, %struct.dim3, %struct.dim3, i32) + @eval @inline cudaGetParameterBuffer(f::Ptr{Cvoid}, + blocks_x::Cuint, blocks_y::Cuint, blocks_z::Cuint, + threads_x::Cuint, threads_y::Cuint, threads_z::Cuint, + shmem::Cuint) = + Base.llvmcall( + ( "declare i8* @cudaGetParameterBufferV2(i8*, {i32,i32,i32}, {i32,i32,i32}, i32)", + $"%f = inttoptr i$WORD_SIZE %0 to i8* + %blocks.x = insertvalue { i32, i32, i32 } undef, i32 %1, 0 + %blocks.y = insertvalue { i32, i32, i32 } %blocks.x, i32 %2, 1 + %blocks.z = insertvalue { i32, i32, i32 } %blocks.y, i32 %3, 2 + %threads.x = insertvalue { i32, i32, i32 } undef, i32 %4, 0 + %threads.y = insertvalue { i32, i32, i32 } %threads.x, i32 %5, 1 + %threads.z = insertvalue { i32, i32, i32 } %threads.y, i32 %6, 2 + %rv = call i8* @cudaGetParameterBufferV2(i8* %f, {i32,i32,i32} %blocks.z, {i32,i32,i32} %threads.z, i32 %7) + %buf = ptrtoint i8* %rv to i$WORD_SIZE + ret i$WORD_SIZE %buf"), Ptr{Cvoid}, + Tuple{Ptr{Cvoid}, Cuint, Cuint, Cuint, Cuint, Cuint, Cuint, Cuint}, + f, blocks_x, blocks_y, blocks_z, threads_x, threads_y, threads_z, shmem) +end + @generated function parameter_buffer(f::Ptr{Cvoid}, blocks::CuDim3, threads::CuDim3, shmem::Int, args...) # allocate a buffer ex = quote - buf = ccall("extern cudaGetParameterBufferV2", llvmcall, Ptr{Cvoid}, - (Ptr{Cvoid}, CuDim3, CuDim3, Cuint), - f, blocks, threads, shmem) + buf = cudaGetParameterBuffer(f, blocks, threads, shmem) end # store the parameters @@ -55,6 +101,16 @@ end return ex end +if VERSION >= v"1.2.0-DEV.512" + @inline synchronize() = ccall("extern cudaDeviceSynchronize", llvmcall, Cint, ()) +else + @eval @inline synchronize() = + Base.llvmcall( + ("declare i32 @cudaDeviceSynchronize()", + "%rv = call i32 @cudaDeviceSynchronize() + ret i32 %rv"), cudaError_t, Tuple{}) +end + """ synchronize() @@ -64,4 +120,4 @@ and should not be called from the host. `synchronize` acts as a synchronization point for child grids in the context of dynamic parallelism. """ -@inline synchronize() = ccall("extern cudaDeviceSynchronize", llvmcall, Cint, ()) +synchronize diff --git a/src/execution.jl b/src/execution.jl index 12011364..28fb485e 100644 --- a/src/execution.jl +++ b/src/execution.jl @@ -446,6 +446,19 @@ No keyword arguments are supported. 
@inline dynamic_cufunction(f::Core.Function, tt::Type=Tuple{}) = delayed_cufunction(Val(f), Val(tt)) +# marker function that will get picked up during compilation +if VERSION >= v"1.2.0-DEV.512" + @inline cudanativeCompileKernel(id::Int) = + ccall("extern cudanativeCompileKernel", llvmcall, Ptr{Cvoid}, (Int,), id) +else + import Base.Sys: WORD_SIZE + @eval @inline cudanativeCompileKernel(id::Int) = + Base.llvmcall( + ($"declare i$WORD_SIZE @cudanativeCompileKernel(i$WORD_SIZE)", + $"%rv = call i$WORD_SIZE @cudanativeCompileKernel(i$WORD_SIZE %0) + ret i$WORD_SIZE %rv"), Ptr{Cvoid}, Tuple{Int}, id) +end + const delayed_cufunctions = Vector{Tuple{Core.Function,Type}}() @generated function delayed_cufunction(::Val{f}, ::Val{tt}) where {f,tt} global delayed_cufunctions @@ -453,10 +466,8 @@ const delayed_cufunctions = Vector{Tuple{Core.Function,Type}}() id = length(delayed_cufunctions) quote - # drop a marker which will get picked up during compilation # TODO: add an edge to this method instance to support method redefinitions - fptr = ccall("extern cudanativeCompileKernel", llvmcall, Ptr{Cvoid}, (Int,), $id) - + fptr = cudanativeCompileKernel($id) DeviceKernel{f,tt}(fptr) end end From 8d2131f805bbcf29c3105aa86f662523cac257e6 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Fri, 22 Mar 2019 10:54:35 +0100 Subject: [PATCH 32/34] Export dynamic_cufunction. --- src/execution.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/execution.jl b/src/execution.jl index 28fb485e..079edb19 100644 --- a/src/execution.jl +++ b/src/execution.jl @@ -1,6 +1,6 @@ # Native execution support -export @cuda, cudaconvert, cufunction, nearest_warpsize +export @cuda, cudaconvert, cufunction, dynamic_cufunction, nearest_warpsize ## helper functions From 0144cfc1638ab3c4b0a4db3f6998425d02865c5b Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Fri, 22 Mar 2019 11:07:38 +0100 Subject: [PATCH 33/34] Test complex argument passing. --- test/device/execution.jl | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/test/device/execution.jl b/test/device/execution.jl index edc8c107..96876afd 100644 --- a/test/device/execution.jl +++ b/test/device/execution.jl @@ -860,6 +860,8 @@ end end @testset "argument passing" begin + ## padding + function kernel(a, b, c) @cuprintf("%ld %ld %ld", Int64(a), Int64(b), Int64(c)) return @@ -875,6 +877,23 @@ end end @test out == "1 2 3" end + + ## conversion + + function kernel(a) + increment(a) = (a[1] += 1; nothing) + + a[1] = 1 + increment(a) + @cuda dynamic=true increment(a) + + return + end + + dA = CuTestArray{Int,1}((1,)) + @cuda kernel(dA) + A = Array(dA) + @test A == [3] end @testset "self-recursion" begin From da510866b5c4b6800d33fae6cb52beff9338296e Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Fri, 22 Mar 2019 11:15:57 +0100 Subject: [PATCH 34/34] Version bound the closure+capture test. --- test/device/execution.jl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/device/execution.jl b/test/device/execution.jl index 96876afd..293c0701 100644 --- a/test/device/execution.jl +++ b/test/device/execution.jl @@ -843,6 +843,7 @@ end @test out == "Hello, World!" end +if VERSION >= v"1.1" # behavior of captured variables (box or not) has improved over time @testset "closures" begin function hello() x = 1 @@ -858,6 +859,7 @@ end end @test out == "Hello, World 1!" end +end @testset "argument passing" begin ## padding