From 8602a1e7b7972507ad5b34f744d4f84a28fe9b88 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Tue, 19 Mar 2019 15:45:10 +0100 Subject: [PATCH 01/34] Support for keyword arguments to at-cuda. --- src/execution.jl | 56 ++++++++++++++++++++++++++++++++++-------------- 1 file changed, 40 insertions(+), 16 deletions(-) diff --git a/src/execution.jl b/src/execution.jl index 8ae407f8..ff6a3404 100644 --- a/src/execution.jl +++ b/src/execution.jl @@ -63,15 +63,19 @@ end # split keyword arguments to `@cuda` into ones affecting the compiler, or the execution function split_kwargs(kwargs) + macro_kws = [:dynamic] compiler_kws = [:minthreads, :maxthreads, :blocks_per_sm, :maxregs] call_kws = [:blocks, :threads, :shmem, :stream] + macro_kwargs = [] compiler_kwargs = [] call_kwargs = [] for kwarg in kwargs if Meta.isexpr(kwarg, :(=)) key,val = kwarg.args if isa(key, Symbol) - if key in compiler_kws + if key in macro_kws + push!(macro_kwargs, kwarg) + elseif key in compiler_kws push!(compiler_kwargs, kwarg) elseif key in call_kws push!(call_kwargs, kwarg) @@ -86,7 +90,7 @@ function split_kwargs(kwargs) end end - return compiler_kwargs, call_kwargs + return macro_kwargs, compiler_kwargs, call_kwargs end # assign arguments to variables, handle splatting @@ -195,20 +199,39 @@ macro cuda(ex...) args = call.args[2:end] code = quote end - compiler_kwargs, call_kwargs = split_kwargs(kwargs) + macro_kwargs, compiler_kwargs, call_kwargs = split_kwargs(kwargs) vars, var_exprs = assign_args!(code, args) - # convert the arguments, call the compiler and launch the kernel - # while keeping the original arguments alive - push!(code.args, - quote - GC.@preserve $(vars...) begin - local kernel_args = cudaconvert.(($(var_exprs...),)) - local kernel_tt = Tuple{Core.Typeof.(kernel_args)...} - local kernel = cufunction($(esc(f)), kernel_tt; $(map(esc, compiler_kwargs)...)) - kernel(kernel_args...; $(map(esc, call_kwargs)...)) - end - end) + # handle keyword arguments that influence the macro's behavior + dynamic = false + for kwarg in macro_kwargs + key,val = kwarg.args + if key == :dynamic + dynamic = val::Bool + else + throw(ArgumentError("Unsupported keyword argument '$key'")) + end + end + + if dynamic + # dynamic, device-side kernel launch + error("unsupported") + else + # regular, host-side kernel launch + # + # convert the arguments, call the compiler and launch the kernel + # while keeping the original arguments alive + push!(code.args, + quote + GC.@preserve $(vars...) begin + local kernel_args = cudaconvert.(($(var_exprs...),)) + local kernel_tt = Tuple{Core.Typeof.(kernel_args)...} + local kernel = cufunction($(esc(f)), kernel_tt; $(map(esc, compiler_kwargs)...)) + kernel(kernel_args...; $(map(esc, call_kwargs)...)) + end + end) + end + return code end @@ -310,8 +333,8 @@ end end end -# There doesn't seem to be a way to access the documentation for the call-syntax, -# so attach it to the type +# FIXME: there doesn't seem to be a way to access the documentation for the call-syntax, +# so attach it to the type """ (::Kernel)(args...; kwargs...) @@ -326,6 +349,7 @@ The following keyword arguments are supported: """ Kernel + ## other """ From 8509ce5b17b40221bc5803551cfbb4e74007ad95 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Tue, 19 Mar 2019 15:50:56 +0100 Subject: [PATCH 02/34] Update docs. 
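As a rough usage sketch of the keyword groups being documented (the kernel, the device arrays `d_a`/`d_b`/`d_c` and the launch configuration are hypothetical, and `dynamic=true` still errors at this point in the series):

    # hypothetical element-wise kernel
    function vadd(a, b, c)
        i = threadIdx().x
        c[i] = a[i] + b[i]
        return
    end

    # `maxregs` is handled by the compiler (cufunction), `threads` by the
    # kernel call, and `dynamic` by the @cuda macro itself
    @cuda threads=32 maxregs=16 dynamic=false vadd(d_a, d_b, d_c)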
--- src/execution.jl | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/src/execution.jl b/src/execution.jl index ff6a3404..8dae43e7 100644 --- a/src/execution.jl +++ b/src/execution.jl @@ -167,13 +167,14 @@ a CUDA function upon first use, and to a certain extent arguments will be conver managed automatically using `cudaconvert`. Finally, a call to `CUDAdrv.cudacall` is performed, scheduling a kernel launch on the current CUDA context. -Several keyword arguments are supported that influence kernel compilation and execution. For -more information, refer to the documentation of respectively [`cufunction`](@ref) and -[`CUDAnative.Kernel`](@ref) +Several keyword arguments are supported that influence the behavior of `@cuda`. +- `dynamic`: use dynamic parallelism to launch device-side kernels +- arguments that influence kernel compilation: see [`cufunction`](@ref) +- arguments that influence kernel execution: see [`CUDAnative.Kernel`](@ref) The underlying operations (argument conversion, kernel compilation, kernel call) can be performed explicitly when more control is needed, e.g. to reflect on the resource usage of a -kernel to determine the launch configuration: +kernel to determine the launch configuration. A host-side kernel launch is done as follows: args = ... GC.@preserve args begin @@ -248,11 +249,11 @@ Low-level interface to compile a function invocation for the currently-active GP a callable kernel object. For a higher-level interface, use [`@cuda`](@ref). The following keyword arguments are supported: -- minthreads: the required number of threads in a thread block. -- maxthreads: the maximum number of threads in a thread block. -- blocks_per_sm: a minimum number of thread blocks to be scheduled on a single - multiprocessor. -- maxregs: the maximum number of registers to be allocated to a single thread (only +- `minthreads`: the required number of threads in a thread block +- `maxthreads`: the maximum number of threads in a thread block +- `blocks_per_sm`: a minimum number of thread blocks to be scheduled on a single + multiprocessor +- `maxregs`: the maximum number of registers to be allocated to a single thread (only supported on LLVM 4.0+) The output of this function is automatically cached, i.e. you can simply call `cufunction` @@ -342,10 +343,10 @@ Low-level interface to call a compiled kernel, passing GPU-compatible arguments For a higher-level interface, use [`@cuda`](@ref). The following keyword arguments are supported: -- threads (defaults to 1) -- blocks (defaults to 1) -- shmem (defaults to 0) -- stream (defaults to the default stream) +- `threads` (defaults to 1) +- `blocks` (defaults to 1) +- `shmem` (defaults to 0) +- `stream` (defaults to the default stream) """ Kernel From 3e238101c2573f53154d4852b15e6aca3c19a918 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Tue, 19 Mar 2019 18:08:53 +0100 Subject: [PATCH 03/34] Support for detecting dynamic calls and linking code. --- src/compiler/driver.jl | 86 +++++++++++++++++++++++---------- src/device/cuda/libcudadevrt.jl | 2 +- src/execution.jl | 24 ++++++++- src/reflection.jl | 3 +- test/codegen.jl | 6 +-- 5 files changed, 90 insertions(+), 31 deletions(-) diff --git a/src/compiler/driver.jl b/src/compiler/driver.jl index 89bb67f5..3273da8a 100644 --- a/src/compiler/driver.jl +++ b/src/compiler/driver.jl @@ -12,10 +12,16 @@ function module respectively of type `CuFuction` and `CuModule`. For a list of supported keyword arguments, refer to the documentation of [`cufunction`](@ref). 
""" -function compile(dev::CuDevice, @nospecialize(f::Core.Function), @nospecialize(tt); kwargs...) +compile(dev::CuDevice, @nospecialize(f::Core.Function), @nospecialize(tt); + kernel=true, kwargs...) = + compile(CompilerContext(f, tt, supported_capability(dev), kernel; kwargs...)) + +function compile(ctx::CompilerContext) CUDAnative.configured || error("CUDAnative.jl has not been configured; cannot JIT code.") - module_asm, module_entry = compile(supported_capability(dev), f, tt; kwargs...) + # generate code + ir, entry = codegen(ctx) + asm = mcgen(ctx, ir, entry) # enable debug options based on Julia's debug setting jit_options = Dict{CUDAdrv.CUjit_option,Any}() @@ -25,27 +31,23 @@ function compile(dev::CuDevice, @nospecialize(f::Core.Function), @nospecialize(t jit_options[CUDAdrv.GENERATE_DEBUG_INFO] = true end - # Link libcudadevrt + # link the CUDA device library linker = CUDAdrv.CuLink(jit_options) CUDAdrv.add_file!(linker, libcudadevrt, CUDAdrv.LIBRARY) - CUDAdrv.add_data!(linker, module_entry, module_asm) + CUDAdrv.add_data!(linker, LLVM.name(entry), asm) image = CUDAdrv.complete(linker) cuda_mod = CuModule(image, jit_options) - cuda_fun = CuFunction(cuda_mod, module_entry) + cuda_fun = CuFunction(cuda_mod, LLVM.name(entry)) return cuda_fun, cuda_mod end -# same as above, but without an active device -function compile(cap::VersionNumber, @nospecialize(f), @nospecialize(tt); - kernel=true, kwargs...) - ctx = CompilerContext(f, tt, cap, kernel; kwargs...) - - return compile(ctx) -end +codegen(cap::VersionNumber, @nospecialize(f::Core.Function), @nospecialize(tt); + kernel=true, kwargs...) = + codegen(CompilerContext(f, tt, cap, kernel; kwargs...)) -function compile(ctx::CompilerContext) +function codegen(ctx::CompilerContext) if compile_hook[] != nothing hook = compile_hook[] compile_hook[] = nothing @@ -69,38 +71,72 @@ function compile(ctx::CompilerContext) ## low-level code generation (LLVM IR) - mod, entry = irgen(ctx) + ir, entry = irgen(ctx) need_library(lib) = any(f -> isdeclaration(f) && intrinsic_id(f) == 0 && haskey(functions(lib), LLVM.name(f)), - functions(mod)) + functions(ir)) libdevice = load_libdevice(ctx.cap) if need_library(libdevice) - link_libdevice!(ctx, mod, libdevice) + link_libdevice!(ctx, ir, libdevice) end # optimize the IR - entry = optimize!(ctx, mod, entry) + entry = optimize!(ctx, ir, entry) runtime = load_runtime(ctx.cap) if need_library(runtime) - link_library!(ctx, mod, runtime) + link_library!(ctx, ir, runtime) end - prepare_execution!(ctx, mod) + prepare_execution!(ctx, ir) check_invocation(ctx, entry) - # check generated IR - check_ir(ctx, mod) - verify(mod) + + ## dynamic parallelism + + # find dynamic kernel invocations + dyn_calls = [] + if haskey(functions(ir), "cudanativeLaunchDevice") + f = functions(ir)["cudanativeLaunchDevice"] + for use in uses(f) + # decode the call + # FIXME: recover this earlier, from the Julia IR + call = user(use)::LLVM.CallInst + ops = collect(operands(call))[1:2] + ## addrspacecast + ops = LLVM.Value[first(operands(val)) for val in ops] + ## inttoptr + ops = ConstantInt[first(operands(val)) for val in ops] + ## integer constants + ops = convert.(Int, ops) + ## actual pointer values + ops = Ptr{Any}.(ops) + + dyn_f, dyn_tt = unsafe_pointer_to_objref.(ops) + push!(dyn_calls, (call, dyn_f, dyn_tt)) + end + end + + # compile and link + for (call, dyn_f, dyn_tt) in dyn_calls + dyn_ctx = CompilerContext(dyn_f, dyn_tt, ctx.cap, true) + dyn_ir, dyn_entry = codegen(dyn_ctx) + link_library!(ctx, ir, dyn_ir) + + # TODO + 
unsafe_delete!(LLVM.parent(call), call) + end - ## machine code generation (PTX assembly) + ## finalization - module_asm = mcgen(ctx, mod, entry) + # check generated IR + check_ir(ctx, ir) + verify(ir) - return module_asm, LLVM.name(entry) + return ir, entry end diff --git a/src/device/cuda/libcudadevrt.jl b/src/device/cuda/libcudadevrt.jl index 861d7d9b..39faf3b2 100644 --- a/src/device/cuda/libcudadevrt.jl +++ b/src/device/cuda/libcudadevrt.jl @@ -13,4 +13,4 @@ and should not be called from the host. `synchronize` acts as a synchronization point for child grids in the context of dynamic parallelism. """ -@inline synchronize() = @wrap cudaDeviceSynchronize()::i32 +@inline synchronize() = ccall("extern cudaDeviceSynchronize", llvmcall, Cint, ()) diff --git a/src/execution.jl b/src/execution.jl index 8dae43e7..c2dab477 100644 --- a/src/execution.jl +++ b/src/execution.jl @@ -216,7 +216,16 @@ macro cuda(ex...) if dynamic # dynamic, device-side kernel launch - error("unsupported") + # + # WIP + # TODO: GC.@preserve? + # TODO: error on, or support kwargs + kernel_args = var_exprs # already in kernel land, so don't need a conversion + push!(code.args, + quote + local kernel_tt = Tuple{$((:(Core.Typeof($var)) for var in var_exprs)...)} + dynamic_cufunction($(esc(f)), kernel_tt) + end) else # regular, host-side kernel launch # @@ -236,6 +245,19 @@ macro cuda(ex...) return code end +@generated function dynamic_cufunction(f::Core.Function, tt::Type=Tuple{}) + tt = Base.to_tuple_type(tt.parameters[1]) + sig = Base.signature_type(f, tt) + t = Tuple(tt.parameters) + # TODO: closures + + quote + # drop the f and tt into the module, and recover them later during compilation + ccall("extern cudanativeLaunchDevice", llvmcall, Nothing, (Any, Any), f, tt) + nothing + end +end + ## APIs for manual compilation diff --git a/src/reflection.jl b/src/reflection.jl index c8760ff7..fc5bb388 100644 --- a/src/reflection.jl +++ b/src/reflection.jl @@ -112,7 +112,8 @@ function code_sass(io::IO, ctx::CompilerContext) error("Your CUDA installation does not provide ptxas or nvdisasm, both of which are required for code_sass") end - ptx,_ = compile(ctx) + ir, entry = codegen(ctx) + ptx = mcgen(ctx, ir, entry) fn = tempname() gpu = "sm_$(ctx.cap.major)$(ctx.cap.minor)" diff --git a/test/codegen.jl b/test/codegen.jl index 89d44e71..68eada07 100644 --- a/test/codegen.jl +++ b/test/codegen.jl @@ -461,7 +461,7 @@ end @testset "non-isbits arguments" begin foobar(i) = (sink(unsafe_trunc(Int,i)); return) - @test_throws_message(CUDAnative.KernelError, CUDAnative.compile(v"3.5", foobar, Tuple{BigInt})) do msg + @test_throws_message(CUDAnative.KernelError, CUDAnative.codegen(v"3.5", foobar, Tuple{BigInt})) do msg occursin("passing and using non-bitstype argument", msg) && occursin("BigInt", msg) end @@ -470,7 +470,7 @@ end @testset "invalid LLVM IR" begin foobar(i) = println(i) - @test_throws_message(CUDAnative.InvalidIRError, CUDAnative.compile(v"3.5", foobar, Tuple{Int})) do msg + @test_throws_message(CUDAnative.InvalidIRError, CUDAnative.codegen(v"3.5", foobar, Tuple{Int})) do msg occursin("invalid LLVM IR", msg) && occursin(CUDAnative.RUNTIME_FUNCTION, msg) && occursin("[1] println", msg) && @@ -481,7 +481,7 @@ end @testset "invalid LLVM IR (ccall)" begin foobar(p) = (unsafe_store!(p, ccall(:time, Cint, ())); nothing) - @test_throws_message(CUDAnative.InvalidIRError, CUDAnative.compile(v"3.5", foobar, Tuple{Ptr{Int}})) do msg + @test_throws_message(CUDAnative.InvalidIRError, CUDAnative.codegen(v"3.5", foobar, Tuple{Ptr{Int}})) do 
msg occursin("invalid LLVM IR", msg) && occursin(CUDAnative.POINTER_FUNCTION, msg) && occursin(r"\[1\] .+foobar", msg) From 56b59e388a8ff32cce8693ce86e85add1a70d77a Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Tue, 19 Mar 2019 18:22:41 +0100 Subject: [PATCH 04/34] Make codegen() support invalid code. --- src/compiler/driver.jl | 20 +++++++++----------- src/execution.jl | 4 +++- test/codegen.jl | 6 +++--- 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/src/compiler/driver.jl b/src/compiler/driver.jl index 3273da8a..2abf7a9a 100644 --- a/src/compiler/driver.jl +++ b/src/compiler/driver.jl @@ -4,23 +4,27 @@ const compile_hook = Ref{Union{Nothing,Function}}(nothing) """ - compile(dev::CuDevice, f, tt; kwargs...) + compile(cap::VersionNumber, f, tt; kernel=true, kwargs...) -Compile a function `f` invoked with types `tt` for device `dev`, returning the compiled -function module respectively of type `CuFuction` and `CuModule`. +Compile a function `f` invoked with types `tt` for device `dev` or its compute capability +`cap`, returning the compiled function module respectively of type `CuFuction` and +`CuModule`. For a list of supported keyword arguments, refer to the documentation of [`cufunction`](@ref). """ -compile(dev::CuDevice, @nospecialize(f::Core.Function), @nospecialize(tt); +compile(cap::VersionNumber, @nospecialize(f::Core.Function), @nospecialize(tt); kernel=true, kwargs...) = - compile(CompilerContext(f, tt, supported_capability(dev), kernel; kwargs...)) + compile(CompilerContext(f, tt, cap, kernel; kwargs...)) function compile(ctx::CompilerContext) CUDAnative.configured || error("CUDAnative.jl has not been configured; cannot JIT code.") # generate code ir, entry = codegen(ctx) + check_invocation(ctx, entry) + check_ir(ctx, ir) + verify(ir) asm = mcgen(ctx, ir, entry) # enable debug options based on Julia's debug setting @@ -93,8 +97,6 @@ function codegen(ctx::CompilerContext) prepare_execution!(ctx, ir) - check_invocation(ctx, entry) - ## dynamic parallelism @@ -134,9 +136,5 @@ function codegen(ctx::CompilerContext) ## finalization - # check generated IR - check_ir(ctx, ir) - verify(ir) - return ir, entry end diff --git a/src/execution.jl b/src/execution.jl index c2dab477..3609e594 100644 --- a/src/execution.jl +++ b/src/execution.jl @@ -312,7 +312,9 @@ when function changes, or when different types or keyword arguments are provided key = hash(getfield(f, nf), key) end if !haskey(compilecache, key) - fun, mod = compile(device(ctx), f, tt; kwargs...) + dev = device(ctx) + cap = supported_capability(dev) + fun, mod = compile(cap, f, tt; kwargs...) 
kernel = Kernel{f,tt}(ctx, mod, fun) @debug begin ver = version(kernel) diff --git a/test/codegen.jl b/test/codegen.jl index 68eada07..89d44e71 100644 --- a/test/codegen.jl +++ b/test/codegen.jl @@ -461,7 +461,7 @@ end @testset "non-isbits arguments" begin foobar(i) = (sink(unsafe_trunc(Int,i)); return) - @test_throws_message(CUDAnative.KernelError, CUDAnative.codegen(v"3.5", foobar, Tuple{BigInt})) do msg + @test_throws_message(CUDAnative.KernelError, CUDAnative.compile(v"3.5", foobar, Tuple{BigInt})) do msg occursin("passing and using non-bitstype argument", msg) && occursin("BigInt", msg) end @@ -470,7 +470,7 @@ end @testset "invalid LLVM IR" begin foobar(i) = println(i) - @test_throws_message(CUDAnative.InvalidIRError, CUDAnative.codegen(v"3.5", foobar, Tuple{Int})) do msg + @test_throws_message(CUDAnative.InvalidIRError, CUDAnative.compile(v"3.5", foobar, Tuple{Int})) do msg occursin("invalid LLVM IR", msg) && occursin(CUDAnative.RUNTIME_FUNCTION, msg) && occursin("[1] println", msg) && @@ -481,7 +481,7 @@ end @testset "invalid LLVM IR (ccall)" begin foobar(p) = (unsafe_store!(p, ccall(:time, Cint, ())); nothing) - @test_throws_message(CUDAnative.InvalidIRError, CUDAnative.codegen(v"3.5", foobar, Tuple{Ptr{Int}})) do msg + @test_throws_message(CUDAnative.InvalidIRError, CUDAnative.compile(v"3.5", foobar, Tuple{Ptr{Int}})) do msg occursin("invalid LLVM IR", msg) && occursin(CUDAnative.POINTER_FUNCTION, msg) && occursin(r"\[1\] .+foobar", msg) From d43a3b143f7afc30b572344abbd5a3509988d549 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Tue, 19 Mar 2019 20:34:42 +0100 Subject: [PATCH 05/34] Restructure the main compiler entry-point for better reuse. --- src/compiler/driver.jl | 152 +++++++++++++++++++++++------------------ src/execution.jl | 2 +- src/reflection.jl | 25 ++----- test/codegen.jl | 6 +- 4 files changed, 95 insertions(+), 90 deletions(-) diff --git a/src/compiler/driver.jl b/src/compiler/driver.jl index 2abf7a9a..ef410aad 100644 --- a/src/compiler/driver.jl +++ b/src/compiler/driver.jl @@ -4,76 +4,55 @@ const compile_hook = Ref{Union{Nothing,Function}}(nothing) """ - compile(cap::VersionNumber, f, tt; kernel=true, kwargs...) + compile(to::Symbol, cap::VersionNumber, f, tt; + kernel=true, optimize=true, strip=false, hooked=false, + kwargs...) -Compile a function `f` invoked with types `tt` for device `dev` or its compute capability -`cap`, returning the compiled function module respectively of type `CuFuction` and -`CuModule`. +Compile a function `f` invoked with types `tt` for device capability `cap` to one of the +following formats as specified by the `to` argument: `:julia` for Julia IR, `:llvm` for LLVM +IR, `:ptx` for PTX assembly and `:cuda` for CUDA driver objects. -For a list of supported keyword arguments, refer to the documentation of -[`cufunction`](@ref). -""" -compile(cap::VersionNumber, @nospecialize(f::Core.Function), @nospecialize(tt); - kernel=true, kwargs...) 
= - compile(CompilerContext(f, tt, cap, kernel; kwargs...)) - -function compile(ctx::CompilerContext) - CUDAnative.configured || error("CUDAnative.jl has not been configured; cannot JIT code.") - - # generate code - ir, entry = codegen(ctx) - check_invocation(ctx, entry) - check_ir(ctx, ir) - verify(ir) - asm = mcgen(ctx, ir, entry) - - # enable debug options based on Julia's debug setting - jit_options = Dict{CUDAdrv.CUjit_option,Any}() - if Base.JLOptions().debug_level == 1 - jit_options[CUDAdrv.GENERATE_LINE_INFO] = true - elseif Base.JLOptions().debug_level >= 2 - jit_options[CUDAdrv.GENERATE_DEBUG_INFO] = true - end +The following keyword arguments are supported: +- `kernel`: enable kernel-specific code generation +- `optimize`: optimize the code +- `strip`: strip non-functional metadata and debug information - # link the CUDA device library - linker = CUDAdrv.CuLink(jit_options) - CUDAdrv.add_file!(linker, libcudadevrt, CUDAdrv.LIBRARY) - CUDAdrv.add_data!(linker, LLVM.name(entry), asm) - image = CUDAdrv.complete(linker) - - cuda_mod = CuModule(image, jit_options) - cuda_fun = CuFunction(cuda_mod, LLVM.name(entry)) - - return cuda_fun, cuda_mod -end - -codegen(cap::VersionNumber, @nospecialize(f::Core.Function), @nospecialize(tt); - kernel=true, kwargs...) = - codegen(CompilerContext(f, tt, cap, kernel; kwargs...)) - -function codegen(ctx::CompilerContext) - if compile_hook[] != nothing - hook = compile_hook[] - compile_hook[] = nothing - - global globalUnique - previous_globalUnique = globalUnique - - hook(ctx) - - globalUnique = previous_globalUnique - compile_hook[] = hook +Other keyword arguments can be found in the documentation of [`cufunction`](@ref). +""" +compile(to::Symbol, cap::VersionNumber, @nospecialize(f::Core.Function), @nospecialize(tt); + kernel::Bool=true, optimize::Bool=true, strip::Bool=false, hooked::Bool=false, + kwargs...) = + compile(to, CompilerContext(f, tt, cap, kernel; kwargs...); + optimize=optimize, strip=strip, hooked=hooked) + +function compile(to::Symbol, ctx::CompilerContext; + optimize::Bool=true, strip::Bool=false, hooked::Bool=false) + if !hooked + @debug "(Re)compiling function" ctx + if compile_hook[] != nothing + hook = compile_hook[] + compile_hook[] = nothing + + global globalUnique + previous_globalUnique = globalUnique + + hook(ctx) + + globalUnique = previous_globalUnique + compile_hook[] = hook + end end - ## high-level code generation (Julia AST) - - @debug "(Re)compiling function" ctx + ## Julia IR check_method(ctx) + # TODO: get the method here, don't put it in the context? 
+ #to == :julia && return asm - ## low-level code generation (LLVM IR) + + ## LLVM IR ir, entry = irgen(ctx) @@ -88,14 +67,20 @@ function codegen(ctx::CompilerContext) end # optimize the IR - entry = optimize!(ctx, ir, entry) + if optimize + entry = optimize!(ctx, ir, entry) + end runtime = load_runtime(ctx.cap) if need_library(runtime) link_library!(ctx, ir, runtime) end - prepare_execution!(ctx, ir) + verify(ir) + + if strip + strip_debuginfo!(ir) + end ## dynamic parallelism @@ -126,15 +111,50 @@ function codegen(ctx::CompilerContext) # compile and link for (call, dyn_f, dyn_tt) in dyn_calls dyn_ctx = CompilerContext(dyn_f, dyn_tt, ctx.cap, true) - dyn_ir, dyn_entry = codegen(dyn_ctx) - link_library!(ctx, ir, dyn_ir) + dyn_ir, dyn_entry = + compile(:llvm, dyn_ctx; optimize=optimize, strip=strip, hooked=hooked) + link!(ir, dyn_ir) # TODO unsafe_delete!(LLVM.parent(call), call) end + to == :llvm && return ir, entry + + + ## PTX machine code + + prepare_execution!(ctx, ir) + + check_invocation(ctx, entry) + check_ir(ctx, ir) + + asm = mcgen(ctx, ir, entry) + + to == :ptx && return asm, LLVM.name(entry) + + + ## CUDA objects + + # enable debug options based on Julia's debug setting + jit_options = Dict{CUDAdrv.CUjit_option,Any}() + if Base.JLOptions().debug_level == 1 + jit_options[CUDAdrv.GENERATE_LINE_INFO] = true + elseif Base.JLOptions().debug_level >= 2 + jit_options[CUDAdrv.GENERATE_DEBUG_INFO] = true + end + + # link the CUDA device library + linker = CUDAdrv.CuLink(jit_options) + CUDAdrv.add_file!(linker, libcudadevrt, CUDAdrv.LIBRARY) + CUDAdrv.add_data!(linker, LLVM.name(entry), asm) + image = CUDAdrv.complete(linker) + + cuda_mod = CuModule(image, jit_options) + cuda_fun = CuFunction(cuda_mod, LLVM.name(entry)) + + to == :cuda && return cuda_fun, cuda_mod - ## finalization - return ir, entry + error("Unknown compilation target $to") end diff --git a/src/execution.jl b/src/execution.jl index 3609e594..0f96e355 100644 --- a/src/execution.jl +++ b/src/execution.jl @@ -314,7 +314,7 @@ when function changes, or when different types or keyword arguments are provided if !haskey(compilecache, key) dev = device(ctx) cap = supported_capability(dev) - fun, mod = compile(cap, f, tt; kwargs...) + fun, mod = compile(:cuda, cap, f, tt; kwargs...) 
kernel = Kernel{f,tt}(ctx, mod, fun) @debug begin ver = version(kernel) diff --git a/src/reflection.jl b/src/reflection.jl index fc5bb388..5aa543b7 100644 --- a/src/reflection.jl +++ b/src/reflection.jl @@ -37,16 +37,9 @@ function code_llvm(io::IO, @nospecialize(func::Core.Function), @nospecialize(typ end function code_llvm(io::IO, ctx::CompilerContext; optimize::Bool=true, dump_module::Bool=false, strip_ir_metadata::Bool=true) - check_method(ctx) - mod, entry = irgen(ctx) - if optimize - entry = optimize!(ctx, mod, entry) - end - if strip_ir_metadata - strip_debuginfo!(mod) - end + ir, entry = compile(:llvm, ctx; optimize=optimize, strip=strip_ir_metadata) if dump_module - show(io, mod) + show(io, ir) else show(io, entry) end @@ -74,15 +67,8 @@ function code_ptx(io::IO, @nospecialize(func::Core.Function), @nospecialize(type code_ptx(io, ctx; strip_ir_metadata=strip_ir_metadata) end function code_ptx(io::IO, ctx::CompilerContext; strip_ir_metadata::Bool=true) - check_method(ctx) - mod, entry = irgen(ctx) - entry = optimize!(ctx, mod, entry) - if strip_ir_metadata - strip_debuginfo!(mod) - end - prepare_execution!(ctx, mod) - ptx = mcgen(ctx, mod, entry) - print(io, ptx) + asm, _ = compile(:ptx, ctx; strip=strip_ir_metadata) + print(io, asm) end code_ptx(@nospecialize(func), @nospecialize(types); kwargs...) = code_ptx(stdout, func, types; kwargs...) @@ -112,8 +98,7 @@ function code_sass(io::IO, ctx::CompilerContext) error("Your CUDA installation does not provide ptxas or nvdisasm, both of which are required for code_sass") end - ir, entry = codegen(ctx) - ptx = mcgen(ctx, ir, entry) + ptx, _ = compile(:ptx, ctx) fn = tempname() gpu = "sm_$(ctx.cap.major)$(ctx.cap.minor)" diff --git a/test/codegen.jl b/test/codegen.jl index 89d44e71..23874ecf 100644 --- a/test/codegen.jl +++ b/test/codegen.jl @@ -461,7 +461,7 @@ end @testset "non-isbits arguments" begin foobar(i) = (sink(unsafe_trunc(Int,i)); return) - @test_throws_message(CUDAnative.KernelError, CUDAnative.compile(v"3.5", foobar, Tuple{BigInt})) do msg + @test_throws_message(CUDAnative.KernelError, CUDAnative.compile(:ptx, v"3.5", foobar, Tuple{BigInt})) do msg occursin("passing and using non-bitstype argument", msg) && occursin("BigInt", msg) end @@ -470,7 +470,7 @@ end @testset "invalid LLVM IR" begin foobar(i) = println(i) - @test_throws_message(CUDAnative.InvalidIRError, CUDAnative.compile(v"3.5", foobar, Tuple{Int})) do msg + @test_throws_message(CUDAnative.InvalidIRError, CUDAnative.compile(:ptx, v"3.5", foobar, Tuple{Int})) do msg occursin("invalid LLVM IR", msg) && occursin(CUDAnative.RUNTIME_FUNCTION, msg) && occursin("[1] println", msg) && @@ -481,7 +481,7 @@ end @testset "invalid LLVM IR (ccall)" begin foobar(p) = (unsafe_store!(p, ccall(:time, Cint, ())); nothing) - @test_throws_message(CUDAnative.InvalidIRError, CUDAnative.compile(v"3.5", foobar, Tuple{Ptr{Int}})) do msg + @test_throws_message(CUDAnative.InvalidIRError, CUDAnative.compile(:ptx, v"3.5", foobar, Tuple{Ptr{Int}})) do msg occursin("invalid LLVM IR", msg) && occursin(CUDAnative.POINTER_FUNCTION, msg) && occursin(r"\[1\] .+foobar", msg) From d787f70d757a042da36e9171895dcd5e165bb427 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Tue, 19 Mar 2019 22:16:17 +0100 Subject: [PATCH 06/34] Don't double-report dynamic kernel compilations. 
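Reflection installs `compile_hook[]` to trace compilations; without this change, compiling a kernel that performs a device-side launch would fire that hook again for the recursively-compiled child kernel. A rough illustration (hypothetical kernels, using the internal hook and compile entry points):

    child() = nothing
    parent() = (@cuda dynamic=true child(); nothing)

    CUDAnative.compile_hook[] = ctx -> println("compiling ", ctx.f)
    CUDAnative.compile(:ptx, v"3.5", parent, Tuple{})
    # expected to report `parent` only; the recursive compilation of `child`
    # is linked in without a second report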
--- src/compiler/driver.jl | 65 ++++++++++++++++++++++++------------------ 1 file changed, 37 insertions(+), 28 deletions(-) diff --git a/src/compiler/driver.jl b/src/compiler/driver.jl index ef410aad..344f15cb 100644 --- a/src/compiler/driver.jl +++ b/src/compiler/driver.jl @@ -5,8 +5,7 @@ const compile_hook = Ref{Union{Nothing,Function}}(nothing) """ compile(to::Symbol, cap::VersionNumber, f, tt; - kernel=true, optimize=true, strip=false, hooked=false, - kwargs...) + kernel=true, optimize=true, strip=false, ...) Compile a function `f` invoked with types `tt` for device capability `cap` to one of the following formats as specified by the `to` argument: `:julia` for Julia IR, `:llvm` for LLVM @@ -20,27 +19,25 @@ The following keyword arguments are supported: Other keyword arguments can be found in the documentation of [`cufunction`](@ref). """ compile(to::Symbol, cap::VersionNumber, @nospecialize(f::Core.Function), @nospecialize(tt); - kernel::Bool=true, optimize::Bool=true, strip::Bool=false, hooked::Bool=false, - kwargs...) = + kernel::Bool=true, optimize::Bool=true, strip::Bool=false, kwargs...) = compile(to, CompilerContext(f, tt, cap, kernel; kwargs...); - optimize=optimize, strip=strip, hooked=hooked) + optimize=optimize, strip=strip) function compile(to::Symbol, ctx::CompilerContext; - optimize::Bool=true, strip::Bool=false, hooked::Bool=false) - if !hooked - @debug "(Re)compiling function" ctx - if compile_hook[] != nothing - hook = compile_hook[] - compile_hook[] = nothing + optimize::Bool=true, strip::Bool=false) + @debug "(Re)compiling function" ctx - global globalUnique - previous_globalUnique = globalUnique + if compile_hook[] != nothing + hook = compile_hook[] + compile_hook[] = nothing - hook(ctx) + global globalUnique + previous_globalUnique = globalUnique - globalUnique = previous_globalUnique - compile_hook[] = hook - end + hook(ctx) + + globalUnique = previous_globalUnique + compile_hook[] = hook end @@ -85,9 +82,10 @@ function compile(to::Symbol, ctx::CompilerContext; ## dynamic parallelism - # find dynamic kernel invocations - dyn_calls = [] if haskey(functions(ir), "cudanativeLaunchDevice") + dyn_calls = [] + + # find dynamic kernel invocations f = functions(ir)["cudanativeLaunchDevice"] for use in uses(f) # decode the call @@ -106,17 +104,28 @@ function compile(to::Symbol, ctx::CompilerContext; dyn_f, dyn_tt = unsafe_pointer_to_objref.(ops) push!(dyn_calls, (call, dyn_f, dyn_tt)) end - end - # compile and link - for (call, dyn_f, dyn_tt) in dyn_calls - dyn_ctx = CompilerContext(dyn_f, dyn_tt, ctx.cap, true) - dyn_ir, dyn_entry = - compile(:llvm, dyn_ctx; optimize=optimize, strip=strip, hooked=hooked) - link!(ir, dyn_ir) + # compile and link + for (call, dyn_f, dyn_tt) in dyn_calls + # disable the compile hook; this recursive compilation call + # shouldn't be traced separately + hook = compile_hook[] + compile_hook[] = nothing + + dyn_ctx = CompilerContext(dyn_f, dyn_tt, ctx.cap, true) + dyn_ir, dyn_entry = + compile(:llvm, dyn_ctx; optimize=optimize, strip=strip) + + compile_hook[] = hook + + link!(ir, dyn_ir) + + # TODO + unsafe_delete!(LLVM.parent(call), call) + end - # TODO - unsafe_delete!(LLVM.parent(call), call) + @compiler_assert isempty(uses(f)) ctx + unsafe_delete!(ir, f) end to == :llvm && return ir, entry From f20580234e67f3ed59ae75180700a1f0b62e7ff6 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Wed, 20 Mar 2019 09:05:20 +0100 Subject: [PATCH 07/34] Improve hook handline, and reuse compiler driver for rtlib generation. 
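The save/restore dance around `compile_hook[]` is replaced by a `hooks` keyword argument, which reflection and the runtime-library generator simply pass as `false`. A sketch of the resulting entry point (the device function is hypothetical):

    add(x, y) = x + y

    # compile to LLVM IR as a plain device function without firing the
    # reflection hook; this is what emit_function! and code_llvm now do
    ir, entry = CUDAnative.compile(:llvm, v"3.5", add, Tuple{Int,Int}, #=kernel=# false;
                                   hooks=false)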
--- src/compiler/driver.jl | 37 ++++++++++++++----------------------- src/compiler/rtlib.jl | 5 +---- src/reflection.jl | 6 +++--- 3 files changed, 18 insertions(+), 30 deletions(-) diff --git a/src/compiler/driver.jl b/src/compiler/driver.jl index 344f15cb..1fa8bab6 100644 --- a/src/compiler/driver.jl +++ b/src/compiler/driver.jl @@ -4,40 +4,38 @@ const compile_hook = Ref{Union{Nothing,Function}}(nothing) """ - compile(to::Symbol, cap::VersionNumber, f, tt; + compile(to::Symbol, cap::VersionNumber, f, tt, kernel=true; kernel=true, optimize=true, strip=false, ...) Compile a function `f` invoked with types `tt` for device capability `cap` to one of the following formats as specified by the `to` argument: `:julia` for Julia IR, `:llvm` for LLVM -IR, `:ptx` for PTX assembly and `:cuda` for CUDA driver objects. +IR, `:ptx` for PTX assembly and `:cuda` for CUDA driver objects. If the `kernel` flag is +set, specialized code generation and optimization for kernel functions is enabled. The following keyword arguments are supported: -- `kernel`: enable kernel-specific code generation -- `optimize`: optimize the code -- `strip`: strip non-functional metadata and debug information +- `hooks`: enable compiler hooks that drive reflection functions (default: true) +- `optimize`: optimize the code (default: true) +- `strip`: strip non-functional metadata and debug information (default: false) Other keyword arguments can be found in the documentation of [`cufunction`](@ref). """ -compile(to::Symbol, cap::VersionNumber, @nospecialize(f::Core.Function), @nospecialize(tt); - kernel::Bool=true, optimize::Bool=true, strip::Bool=false, kwargs...) = +compile(to::Symbol, cap::VersionNumber, @nospecialize(f::Core.Function), @nospecialize(tt), + kernel::Bool=true; hooks::Bool=true, optimize::Bool=true, strip::Bool=false, + kwargs...) 
= compile(to, CompilerContext(f, tt, cap, kernel; kwargs...); - optimize=optimize, strip=strip) + hooks=hooks, optimize=optimize, strip=strip) function compile(to::Symbol, ctx::CompilerContext; - optimize::Bool=true, strip::Bool=false) + hooks::Bool=true, optimize::Bool=true, strip::Bool=false) @debug "(Re)compiling function" ctx - if compile_hook[] != nothing - hook = compile_hook[] - compile_hook[] = nothing - + if hooks && compile_hook[] != nothing global globalUnique previous_globalUnique = globalUnique - hook(ctx) + compile_hook[](ctx) globalUnique = previous_globalUnique - compile_hook[] = hook end @@ -107,16 +105,9 @@ function compile(to::Symbol, ctx::CompilerContext; # compile and link for (call, dyn_f, dyn_tt) in dyn_calls - # disable the compile hook; this recursive compilation call - # shouldn't be traced separately - hook = compile_hook[] - compile_hook[] = nothing - dyn_ctx = CompilerContext(dyn_f, dyn_tt, ctx.cap, true) dyn_ir, dyn_entry = - compile(:llvm, dyn_ctx; optimize=optimize, strip=strip) - - compile_hook[] = hook + compile(:llvm, dyn_ctx; hooks=false, optimize=optimize, strip=strip) link!(ir, dyn_ir) diff --git a/src/compiler/rtlib.jl b/src/compiler/rtlib.jl index 81abb66d..58b6c9a5 100644 --- a/src/compiler/rtlib.jl +++ b/src/compiler/rtlib.jl @@ -124,11 +124,8 @@ end function emit_function!(mod, cap, f, types, name) tt = Base.to_tuple_type(types) - ctx = CompilerContext(f, tt, cap, #= kernel =# false) - new_mod, entry = irgen(ctx) - entry = optimize!(ctx, new_mod, entry) + new_mod, entry = compile(:llvm, cap, f, tt, #=kernel=# false; hooks=false) LLVM.name!(entry, name) - link!(mod, new_mod) end diff --git a/src/reflection.jl b/src/reflection.jl index 5aa543b7..1b23388f 100644 --- a/src/reflection.jl +++ b/src/reflection.jl @@ -37,7 +37,7 @@ function code_llvm(io::IO, @nospecialize(func::Core.Function), @nospecialize(typ end function code_llvm(io::IO, ctx::CompilerContext; optimize::Bool=true, dump_module::Bool=false, strip_ir_metadata::Bool=true) - ir, entry = compile(:llvm, ctx; optimize=optimize, strip=strip_ir_metadata) + ir, entry = compile(:llvm, ctx; hooks=false, optimize=optimize, strip=strip_ir_metadata) if dump_module show(io, ir) else @@ -67,7 +67,7 @@ function code_ptx(io::IO, @nospecialize(func::Core.Function), @nospecialize(type code_ptx(io, ctx; strip_ir_metadata=strip_ir_metadata) end function code_ptx(io::IO, ctx::CompilerContext; strip_ir_metadata::Bool=true) - asm, _ = compile(:ptx, ctx; strip=strip_ir_metadata) + asm, _ = compile(:ptx, ctx; hooks=false, strip=strip_ir_metadata) print(io, asm) end code_ptx(@nospecialize(func), @nospecialize(types); kwargs...) = @@ -98,7 +98,7 @@ function code_sass(io::IO, ctx::CompilerContext) error("Your CUDA installation does not provide ptxas or nvdisasm, both of which are required for code_sass") end - ptx, _ = compile(:ptx, ctx) + ptx, _ = compile(:ptx, ctx; hooks=false) fn = tempname() gpu = "sm_$(ctx.cap.major)$(ctx.cap.minor)" From 6ab86a2faed3b4f0eb07901aeae40e1b7747cfef Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Wed, 20 Mar 2019 09:15:57 +0100 Subject: [PATCH 08/34] Move Julia irgen to compiler driver. 
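With the method-instance lookup moved into the driver, the `:julia` target becomes meaningful as a first stopping point. A sketch (hypothetical function):

    add(x, y) = x + y
    linfo = CUDAnative.compile(:julia, v"3.5", add, Tuple{Int,Int}, #=kernel=# false)
    # linfo isa Core.MethodInstance: the specialization that irgen compiles next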
--- src/compiler/driver.jl | 19 ++++++++++++++++--- src/compiler/irgen.jl | 19 ++----------------- src/compiler/validation.jl | 2 ++ 3 files changed, 20 insertions(+), 20 deletions(-) diff --git a/src/compiler/driver.jl b/src/compiler/driver.jl index 1fa8bab6..75a2514e 100644 --- a/src/compiler/driver.jl +++ b/src/compiler/driver.jl @@ -43,13 +43,26 @@ function compile(to::Symbol, ctx::CompilerContext; check_method(ctx) - # TODO: get the method here, don't put it in the context? - #to == :julia && return asm + # get the method instance + world = typemax(UInt) + meth = which(ctx.f, ctx.tt) + sig = Base.signature_type(ctx.f, ctx.tt)::Type + (ti, env) = ccall(:jl_type_intersection_with_env, Any, + (Any, Any), sig, meth.sig)::Core.SimpleVector + if VERSION >= v"1.2.0-DEV.320" + meth = Base.func_for_method_checked(meth, ti, env) + else + meth = Base.func_for_method_checked(meth, ti) + end + linfo = ccall(:jl_specializations_get_linfo, Ref{Core.MethodInstance}, + (Any, Any, Any, UInt), meth, ti, env, world) + + to == :julia && return linfo ## LLVM IR - ir, entry = irgen(ctx) + ir, entry = irgen(ctx, linfo) need_library(lib) = any(f -> isdeclaration(f) && intrinsic_id(f) == 0 && diff --git a/src/compiler/irgen.jl b/src/compiler/irgen.jl index 8cefe6d9..cea12d3f 100644 --- a/src/compiler/irgen.jl +++ b/src/compiler/irgen.jl @@ -1,4 +1,4 @@ -# Julia/LLVM IR generation and transformation passes +# LLVM IR generation function module_setup(mod::LLVM.Module) triple!(mod, Int === Int64 ? "nvptx64-nvidia-cuda" : "nvptx-nvidia-cuda") @@ -97,22 +97,7 @@ function compile_linfo(ctx::CompilerContext, linfo::Core.MethodInstance, world) return llvmf, modules end -function irgen(ctx::CompilerContext) - # get the method instance - isa(ctx.f, Core.Builtin) && throw(KernelError(ctx, "function is not a generic function")) - world = typemax(UInt) - meth = which(ctx.f, ctx.tt) - sig = Base.signature_type(ctx.f, ctx.tt)::Type - (ti, env) = ccall(:jl_type_intersection_with_env, Any, - (Any, Any), sig, meth.sig)::Core.SimpleVector - if VERSION >= v"1.2.0-DEV.320" - meth = Base.func_for_method_checked(meth, ti, env) - else - meth = Base.func_for_method_checked(meth, ti) - end - linfo = ccall(:jl_specializations_get_linfo, Ref{Core.MethodInstance}, - (Any, Any, Any, UInt), meth, ti, env, world) - +function irgen(ctx::CompilerContext, linfo, world=typemax(UInt)) entry, modules = compile_linfo(ctx, linfo, world) # link in dependent modules diff --git a/src/compiler/validation.jl b/src/compiler/validation.jl index c69cacba..5fedbd79 100644 --- a/src/compiler/validation.jl +++ b/src/compiler/validation.jl @@ -1,6 +1,8 @@ # validation of properties and code function check_method(ctx::CompilerContext) + isa(ctx.f, Core.Builtin) && throw(KernelError(ctx, "function is not a generic function")) + # get the method ms = Base.methods(ctx.f, ctx.tt) isempty(ms) && throw(KernelError(ctx, "no method found")) From 2126b7400eb970e91a546de40212b1d93a786ca7 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Wed, 20 Mar 2019 09:17:58 +0100 Subject: [PATCH 09/34] NFC changes. 
--- src/compiler/driver.jl | 12 ++++++------ src/compiler/irgen.jl | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/compiler/driver.jl b/src/compiler/driver.jl index 75a2514e..b38b14a8 100644 --- a/src/compiler/driver.jl +++ b/src/compiler/driver.jl @@ -62,7 +62,7 @@ function compile(to::Symbol, ctx::CompilerContext; ## LLVM IR - ir, entry = irgen(ctx, linfo) + ir, entry = irgen(ctx, linfo, world) need_library(lib) = any(f -> isdeclaration(f) && intrinsic_id(f) == 0 && @@ -94,13 +94,13 @@ function compile(to::Symbol, ctx::CompilerContext; ## dynamic parallelism if haskey(functions(ir), "cudanativeLaunchDevice") - dyn_calls = [] + f = functions(ir)["cudanativeLaunchDevice"] # find dynamic kernel invocations - f = functions(ir)["cudanativeLaunchDevice"] + # TODO: recover this information earlier, from the Julia IR + worklist = [] for use in uses(f) # decode the call - # FIXME: recover this earlier, from the Julia IR call = user(use)::LLVM.CallInst ops = collect(operands(call))[1:2] ## addrspacecast @@ -113,11 +113,11 @@ function compile(to::Symbol, ctx::CompilerContext; ops = Ptr{Any}.(ops) dyn_f, dyn_tt = unsafe_pointer_to_objref.(ops) - push!(dyn_calls, (call, dyn_f, dyn_tt)) + push!(worklist, (call, dyn_f, dyn_tt)) end # compile and link - for (call, dyn_f, dyn_tt) in dyn_calls + for (call, dyn_f, dyn_tt) in worklist dyn_ctx = CompilerContext(dyn_f, dyn_tt, ctx.cap, true) dyn_ir, dyn_entry = compile(:llvm, dyn_ctx; hooks=false, optimize=optimize, strip=strip) diff --git a/src/compiler/irgen.jl b/src/compiler/irgen.jl index cea12d3f..e9677092 100644 --- a/src/compiler/irgen.jl +++ b/src/compiler/irgen.jl @@ -97,7 +97,7 @@ function compile_linfo(ctx::CompilerContext, linfo::Core.MethodInstance, world) return llvmf, modules end -function irgen(ctx::CompilerContext, linfo, world=typemax(UInt)) +function irgen(ctx::CompilerContext, linfo::Core.MethodInstance, world) entry, modules = compile_linfo(ctx, linfo, world) # link in dependent modules From f1dca4b92ee88a188d3b1209412862b3559306c8 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Wed, 20 Mar 2019 10:44:52 +0100 Subject: [PATCH 10/34] Initial use of libcudadevrt for dynamic parallelism. 
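This makes a zero-argument device-side launch go through libcudadevrt. A rough sketch of what becomes expressible (the kernels are hypothetical; arguments and launch configuration keywords are still unsupported at this point):

    child() = nothing

    function parent()
        # expands to dynamic_cufunction + dynamic_launch, which call
        # cudaGetParameterBufferV2 and cudaLaunchDeviceV2 on the device
        @cuda dynamic=true child()
        return
    end

    @cuda parent()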
--- src/compiler/driver.jl | 9 ++++++--- src/compiler/validation.jl | 10 ++++++++-- src/execution.jl | 31 +++++++++++++++++++++++++------ 3 files changed, 39 insertions(+), 11 deletions(-) diff --git a/src/compiler/driver.jl b/src/compiler/driver.jl index b38b14a8..8ef39bd9 100644 --- a/src/compiler/driver.jl +++ b/src/compiler/driver.jl @@ -93,8 +93,8 @@ function compile(to::Symbol, ctx::CompilerContext; ## dynamic parallelism - if haskey(functions(ir), "cudanativeLaunchDevice") - f = functions(ir)["cudanativeLaunchDevice"] + if haskey(functions(ir), "cudanativeCompileKernel") + f = functions(ir)["cudanativeCompileKernel"] # find dynamic kernel invocations # TODO: recover this information earlier, from the Julia IR @@ -122,9 +122,12 @@ function compile(to::Symbol, ctx::CompilerContext; dyn_ir, dyn_entry = compile(:llvm, dyn_ctx; hooks=false, optimize=optimize, strip=strip) + dyn_fn = LLVM.name(dyn_entry) link!(ir, dyn_ir) + dyn_ir = nothing + dyn_entry = functions(ir)[dyn_fn] - # TODO + replace_uses!(call, dyn_entry) unsafe_delete!(LLVM.parent(call), call) end diff --git a/src/compiler/validation.jl b/src/compiler/validation.jl index 5fedbd79..1c225f44 100644 --- a/src/compiler/validation.jl +++ b/src/compiler/validation.jl @@ -100,8 +100,14 @@ function check_ir!(ctx, errors::Vector{IRError}, f::LLVM.Function) return errors end -const special_fns = ("vprintf", "__assertfail", "malloc", "free", "__nvvm_reflect", - #=device_rt=# "cudaDeviceSynchronize") +const special_fns = ( + # PTX intrinsics + "vprintf", "__assertfail", "malloc", "free", + # libdevice + "__nvvm_reflect", + # libcudevrt + "cudaDeviceSynchronize", "cudaGetParameterBufferV2", "cudaLaunchDeviceV2" +) const libjulia = Ref{Ptr{Cvoid}}(C_NULL) diff --git a/src/execution.jl b/src/execution.jl index 0f96e355..f9ebc4ac 100644 --- a/src/execution.jl +++ b/src/execution.jl @@ -61,7 +61,8 @@ end ## helper functions -# split keyword arguments to `@cuda` into ones affecting the compiler, or the execution +# split keyword arguments to `@cuda` into ones affecting the macro itself, the compiler and +# the code it generates, or the execution function split_kwargs(kwargs) macro_kws = [:dynamic] compiler_kws = [:minthreads, :maxthreads, :blocks_per_sm, :maxregs] @@ -220,11 +221,12 @@ macro cuda(ex...) # WIP # TODO: GC.@preserve? # TODO: error on, or support kwargs - kernel_args = var_exprs # already in kernel land, so don't need a conversion push!(code.args, quote + # we're in kernel land already, so no need to convert arguments local kernel_tt = Tuple{$((:(Core.Typeof($var)) for var in var_exprs)...)} - dynamic_cufunction($(esc(f)), kernel_tt) + local kernel = dynamic_cufunction($(esc(f)), kernel_tt) + dynamic_launch(kernel, 1, 1, 0, C_NULL) end) else # regular, host-side kernel launch @@ -236,7 +238,8 @@ macro cuda(ex...) GC.@preserve $(vars...) begin local kernel_args = cudaconvert.(($(var_exprs...),)) local kernel_tt = Tuple{Core.Typeof.(kernel_args)...} - local kernel = cufunction($(esc(f)), kernel_tt; $(map(esc, compiler_kwargs)...)) + local kernel = cufunction($(esc(f)), kernel_tt; + $(map(esc, compiler_kwargs)...)) kernel(kernel_args...; $(map(esc, call_kwargs)...)) end end) @@ -245,6 +248,23 @@ macro cuda(ex...) 
return code end +import CUDAdrv: CuDim3 + +const cudaError_t = Cint +const cudaStream_t = Ptr{Cvoid} + +@inline function dynamic_launch(f::Ptr{Cvoid}, blocks::CuDim, threads::CuDim, shmem::Int, stream::Ptr{Cvoid}) + blocks = CuDim3(blocks) + threads = CuDim3(threads) + buf = ccall("extern cudaGetParameterBufferV2", llvmcall, Ptr{Cvoid}, + (Ptr{Cvoid}, CuDim3, CuDim3, Cuint), + f, blocks, threads, shmem) + + ccall("extern cudaLaunchDeviceV2", llvmcall, cudaError_t, + (Ptr{Cvoid}, cudaStream_t), + buf, stream) +end + @generated function dynamic_cufunction(f::Core.Function, tt::Type=Tuple{}) tt = Base.to_tuple_type(tt.parameters[1]) sig = Base.signature_type(f, tt) @@ -253,8 +273,7 @@ end quote # drop the f and tt into the module, and recover them later during compilation - ccall("extern cudanativeLaunchDevice", llvmcall, Nothing, (Any, Any), f, tt) - nothing + ccall("extern cudanativeCompileKernel", llvmcall, Ptr{Cvoid}, (Any, Any), f, tt) end end From 6646a4b559281ba5358d099cafdd4d87fb83f304 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Wed, 20 Mar 2019 14:07:11 +0100 Subject: [PATCH 11/34] Use simpler dynamic cufunction construction. --- src/execution.jl | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/src/execution.jl b/src/execution.jl index f9ebc4ac..f83cd6b9 100644 --- a/src/execution.jl +++ b/src/execution.jl @@ -253,6 +253,9 @@ import CUDAdrv: CuDim3 const cudaError_t = Cint const cudaStream_t = Ptr{Cvoid} +dynamic_cufunction(f::Core.Function, tt::Type=Tuple{}) = + ccall("extern cudanativeCompileKernel", llvmcall, Ptr{Cvoid}, (Any, Any), f, tt) + @inline function dynamic_launch(f::Ptr{Cvoid}, blocks::CuDim, threads::CuDim, shmem::Int, stream::Ptr{Cvoid}) blocks = CuDim3(blocks) threads = CuDim3(threads) @@ -265,18 +268,6 @@ const cudaStream_t = Ptr{Cvoid} buf, stream) end -@generated function dynamic_cufunction(f::Core.Function, tt::Type=Tuple{}) - tt = Base.to_tuple_type(tt.parameters[1]) - sig = Base.signature_type(f, tt) - t = Tuple(tt.parameters) - # TODO: closures - - quote - # drop the f and tt into the module, and recover them later during compilation - ccall("extern cudanativeCompileKernel", llvmcall, Ptr{Cvoid}, (Any, Any), f, tt) - end -end - ## APIs for manual compilation From 3bb7eeaedcb25919794dedee38fa5fd2b89a15f2 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Wed, 20 Mar 2019 14:07:59 +0100 Subject: [PATCH 12/34] Support call kwargs. --- src/execution.jl | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/execution.jl b/src/execution.jl index f83cd6b9..118eb0c6 100644 --- a/src/execution.jl +++ b/src/execution.jl @@ -226,7 +226,7 @@ macro cuda(ex...) # we're in kernel land already, so no need to convert arguments local kernel_tt = Tuple{$((:(Core.Typeof($var)) for var in var_exprs)...)} local kernel = dynamic_cufunction($(esc(f)), kernel_tt) - dynamic_launch(kernel, 1, 1, 0, C_NULL) + dynamic_launch(kernel, $(var_exprs...); $(map(esc, call_kwargs)...)) end) else # regular, host-side kernel launch @@ -248,21 +248,25 @@ macro cuda(ex...) 
return code end -import CUDAdrv: CuDim3 +import CUDAdrv: CuDim3, CuStream_t const cudaError_t = Cint -const cudaStream_t = Ptr{Cvoid} +const cudaStream_t = CUDAdrv.CuStream_t dynamic_cufunction(f::Core.Function, tt::Type=Tuple{}) = ccall("extern cudanativeCompileKernel", llvmcall, Ptr{Cvoid}, (Any, Any), f, tt) -@inline function dynamic_launch(f::Ptr{Cvoid}, blocks::CuDim, threads::CuDim, shmem::Int, stream::Ptr{Cvoid}) +@inline function dynamic_launch(f::Ptr{Cvoid}, values::Vararg{Any,N}; + blocks::CuDim=1, threads::CuDim=1, shmem::Int=0, + stream::CuStream=CuDefaultStream()) where N blocks = CuDim3(blocks) threads = CuDim3(threads) buf = ccall("extern cudaGetParameterBufferV2", llvmcall, Ptr{Cvoid}, (Ptr{Cvoid}, CuDim3, CuDim3, Cuint), f, blocks, threads, shmem) + # TODO: store into buf + ccall("extern cudaLaunchDeviceV2", llvmcall, cudaError_t, (Ptr{Cvoid}, cudaStream_t), buf, stream) From d94e96de87f42fd616fd340ea7c22bb62e1672cb Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Wed, 20 Mar 2019 15:29:41 +0100 Subject: [PATCH 13/34] Initial support for arguments. --- src/execution.jl | 72 ++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 64 insertions(+), 8 deletions(-) diff --git a/src/execution.jl b/src/execution.jl index 118eb0c6..8259d087 100644 --- a/src/execution.jl +++ b/src/execution.jl @@ -226,7 +226,7 @@ macro cuda(ex...) # we're in kernel land already, so no need to convert arguments local kernel_tt = Tuple{$((:(Core.Typeof($var)) for var in var_exprs)...)} local kernel = dynamic_cufunction($(esc(f)), kernel_tt) - dynamic_launch(kernel, $(var_exprs...); $(map(esc, call_kwargs)...)) + dynamic_cudacall(kernel, kernel_tt, $(var_exprs...); $(map(esc, call_kwargs)...)) end) else # regular, host-side kernel launch @@ -256,20 +256,76 @@ const cudaStream_t = CUDAdrv.CuStream_t dynamic_cufunction(f::Core.Function, tt::Type=Tuple{}) = ccall("extern cudanativeCompileKernel", llvmcall, Ptr{Cvoid}, (Any, Any), f, tt) -@inline function dynamic_launch(f::Ptr{Cvoid}, values::Vararg{Any,N}; - blocks::CuDim=1, threads::CuDim=1, shmem::Int=0, - stream::CuStream=CuDefaultStream()) where N +@generated function dynamic_cudacall(f::Ptr{Cvoid}, tt::Type, args...; + blocks::CuDim=1, threads::CuDim=1, shmem::Integer=0, + stream::CuStream=CuDefaultStream()) + ex = quote + Base.@_inline_meta + end + + # convert the argument values to match the kernel's signature (specified by the user) + # (this mimics `lower-ccall` in julia-syntax.scm) + converted_args = Vector{Symbol}(undef, length(args)) + arg_ptrs = Vector{Symbol}(undef, length(args)) + for i in 1:length(args) + converted_args[i] = gensym() + arg_ptrs[i] = gensym() + push!(ex.args, :($(converted_args[i]) = Base.cconvert($(args[i]), args[$i]))) + push!(ex.args, :($(arg_ptrs[i]) = Base.unsafe_convert($(args[i]), $(converted_args[i])))) + end + + append!(ex.args, (quote + #GC.@preserve $(converted_args...) begin + launch(f, blocks, threads, shmem, stream, ($(arg_ptrs...),)) + #end + end).args) + + return ex +end + +@inline function launch(f::Ptr{Cvoid}, blocks::CuDim, threads::CuDim, + shmem::Int, stream::CuStream, + args...) blocks = CuDim3(blocks) threads = CuDim3(threads) - buf = ccall("extern cudaGetParameterBufferV2", llvmcall, Ptr{Cvoid}, - (Ptr{Cvoid}, CuDim3, CuDim3, Cuint), - f, blocks, threads, shmem) - # TODO: store into buf + buf = parameter_buffer(f, blocks, threads, shmem, args...) 
ccall("extern cudaLaunchDeviceV2", llvmcall, cudaError_t, (Ptr{Cvoid}, cudaStream_t), buf, stream) + + return +end + +@generated function parameter_buffer(f::Ptr{Cvoid}, blocks::CuDim3, threads::CuDim3, + shmem::Int, args...) + # allocate a buffer + ex = quote + buf = ccall("extern cudaGetParameterBufferV2", llvmcall, Ptr{Cvoid}, + (Ptr{Cvoid}, CuDim3, CuDim3, Cuint), + f, blocks, threads, shmem) + end + + # store the parameters + # + # > Each individual parameter placed in the parameter buffer is required to be aligned. + # > That is, each parameter must be placed at the n-th byte in the parameter buffer, + # > where n is the smallest multiple of the parameter size that is greater than the + # > offset of the last byte taken by the preceding parameter. The maximum size of the + # > parameter buffer is 4KB. + offset = 0 + for i in 1:length(args) + buf_index = Base.ceil(Int, offset / sizeof(args[i])) + 1 + offset = buf_index * sizeof(args[i]) + push!(ex.args, :( + unsafe_store!(Base.unsafe_convert(Ptr{$(args[i])}, buf), args[$i], $buf_index) + )) + end + + push!(ex.args, :(return buf)) + + return ex end From cd295380d212c83d8d9f4f48e5cf3397c24963e1 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Wed, 20 Mar 2019 15:42:57 +0100 Subject: [PATCH 14/34] Move code around. --- src/device/cuda/libcudadevrt.jl | 51 ++++++++++++++ src/execution.jl | 117 +++++++++----------------------- 2 files changed, 84 insertions(+), 84 deletions(-) diff --git a/src/device/cuda/libcudadevrt.jl b/src/device/cuda/libcudadevrt.jl index 39faf3b2..59f7c969 100644 --- a/src/device/cuda/libcudadevrt.jl +++ b/src/device/cuda/libcudadevrt.jl @@ -4,6 +4,57 @@ # the CUDA API for execution on the device, such as device synchronization primitives, # dynamic kernel APIs, etc. +import CUDAdrv: CuDim3, CuStream_t + +const cudaError_t = Cint +const cudaStream_t = CUDAdrv.CuStream_t + +# device-side counterpart of CUDAdrv.launch +@inline function launch(f::Ptr{Cvoid}, blocks::CuDim, threads::CuDim, + shmem::Int, stream::CuStream, + args...) + blocks = CuDim3(blocks) + threads = CuDim3(threads) + + buf = parameter_buffer(f, blocks, threads, shmem, args...) + + ccall("extern cudaLaunchDeviceV2", llvmcall, cudaError_t, + (Ptr{Cvoid}, cudaStream_t), + buf, stream) + + return +end + +@generated function parameter_buffer(f::Ptr{Cvoid}, blocks::CuDim3, threads::CuDim3, + shmem::Int, args...) + # allocate a buffer + ex = quote + buf = ccall("extern cudaGetParameterBufferV2", llvmcall, Ptr{Cvoid}, + (Ptr{Cvoid}, CuDim3, CuDim3, Cuint), + f, blocks, threads, shmem) + end + + # store the parameters + # + # > Each individual parameter placed in the parameter buffer is required to be aligned. + # > That is, each parameter must be placed at the n-th byte in the parameter buffer, + # > where n is the smallest multiple of the parameter size that is greater than the + # > offset of the last byte taken by the preceding parameter. The maximum size of the + # > parameter buffer is 4KB. + offset = 0 + for i in 1:length(args) + buf_index = Base.ceil(Int, offset / sizeof(args[i])) + 1 + offset = buf_index * sizeof(args[i]) + push!(ex.args, :( + unsafe_store!(Base.unsafe_convert(Ptr{$(args[i])}, buf), args[$i], $buf_index) + )) + end + + push!(ex.args, :(return buf)) + + return ex +end + """ synchronize() diff --git a/src/execution.jl b/src/execution.jl index 8259d087..c16411bc 100644 --- a/src/execution.jl +++ b/src/execution.jl @@ -217,10 +217,6 @@ macro cuda(ex...) if dynamic # dynamic, device-side kernel launch - # - # WIP - # TODO: GC.@preserve? 
- # TODO: error on, or support kwargs push!(code.args, quote # we're in kernel land already, so no need to convert arguments @@ -248,86 +244,6 @@ macro cuda(ex...) return code end -import CUDAdrv: CuDim3, CuStream_t - -const cudaError_t = Cint -const cudaStream_t = CUDAdrv.CuStream_t - -dynamic_cufunction(f::Core.Function, tt::Type=Tuple{}) = - ccall("extern cudanativeCompileKernel", llvmcall, Ptr{Cvoid}, (Any, Any), f, tt) - -@generated function dynamic_cudacall(f::Ptr{Cvoid}, tt::Type, args...; - blocks::CuDim=1, threads::CuDim=1, shmem::Integer=0, - stream::CuStream=CuDefaultStream()) - ex = quote - Base.@_inline_meta - end - - # convert the argument values to match the kernel's signature (specified by the user) - # (this mimics `lower-ccall` in julia-syntax.scm) - converted_args = Vector{Symbol}(undef, length(args)) - arg_ptrs = Vector{Symbol}(undef, length(args)) - for i in 1:length(args) - converted_args[i] = gensym() - arg_ptrs[i] = gensym() - push!(ex.args, :($(converted_args[i]) = Base.cconvert($(args[i]), args[$i]))) - push!(ex.args, :($(arg_ptrs[i]) = Base.unsafe_convert($(args[i]), $(converted_args[i])))) - end - - append!(ex.args, (quote - #GC.@preserve $(converted_args...) begin - launch(f, blocks, threads, shmem, stream, ($(arg_ptrs...),)) - #end - end).args) - - return ex -end - -@inline function launch(f::Ptr{Cvoid}, blocks::CuDim, threads::CuDim, - shmem::Int, stream::CuStream, - args...) - blocks = CuDim3(blocks) - threads = CuDim3(threads) - - buf = parameter_buffer(f, blocks, threads, shmem, args...) - - ccall("extern cudaLaunchDeviceV2", llvmcall, cudaError_t, - (Ptr{Cvoid}, cudaStream_t), - buf, stream) - - return -end - -@generated function parameter_buffer(f::Ptr{Cvoid}, blocks::CuDim3, threads::CuDim3, - shmem::Int, args...) - # allocate a buffer - ex = quote - buf = ccall("extern cudaGetParameterBufferV2", llvmcall, Ptr{Cvoid}, - (Ptr{Cvoid}, CuDim3, CuDim3, Cuint), - f, blocks, threads, shmem) - end - - # store the parameters - # - # > Each individual parameter placed in the parameter buffer is required to be aligned. - # > That is, each parameter must be placed at the n-th byte in the parameter buffer, - # > where n is the smallest multiple of the parameter size that is greater than the - # > offset of the last byte taken by the preceding parameter. The maximum size of the - # > parameter buffer is 4KB. 
- offset = 0 - for i in 1:length(args) - buf_index = Base.ceil(Int, offset / sizeof(args[i])) + 1 - offset = buf_index * sizeof(args[i]) - push!(ex.args, :( - unsafe_store!(Base.unsafe_convert(Ptr{$(args[i])}, buf), args[$i], $buf_index) - )) - end - - push!(ex.args, :(return buf)) - - return ex -end - ## APIs for manual compilation @@ -445,6 +361,39 @@ The following keyword arguments are supported: Kernel +## dynamic parallelism + +dynamic_cufunction(f::Core.Function, tt::Type=Tuple{}) = + ccall("extern cudanativeCompileKernel", llvmcall, Ptr{Cvoid}, (Any, Any), f, tt) + +@generated function dynamic_cudacall(f::Ptr{Cvoid}, tt::Type, args...; + blocks::CuDim=1, threads::CuDim=1, shmem::Integer=0, + stream::CuStream=CuDefaultStream()) + ex = quote + Base.@_inline_meta + end + + # convert the argument values to match the kernel's signature (specified by the user) + # (this mimics `lower-ccall` in julia-syntax.scm) + converted_args = Vector{Symbol}(undef, length(args)) + arg_ptrs = Vector{Symbol}(undef, length(args)) + for i in 1:length(args) + converted_args[i] = gensym() + arg_ptrs[i] = gensym() + push!(ex.args, :($(converted_args[i]) = Base.cconvert($(args[i]), args[$i]))) + push!(ex.args, :($(arg_ptrs[i]) = Base.unsafe_convert($(args[i]), $(converted_args[i])))) + end + + append!(ex.args, (quote + #GC.@preserve $(converted_args...) begin + launch(f, blocks, threads, shmem, stream, ($(arg_ptrs...),)) + #end + end).args) + + return ex +end + + ## other """ From e9de1bb2a37b8b308f3d5b77877d901cf496d827 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Wed, 20 Mar 2019 15:56:06 +0100 Subject: [PATCH 15/34] Introduce a DynamicKernel object. --- src/execution.jl | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/src/execution.jl b/src/execution.jl index c16411bc..5feafa5d 100644 --- a/src/execution.jl +++ b/src/execution.jl @@ -219,10 +219,10 @@ macro cuda(ex...) # dynamic, device-side kernel launch push!(code.args, quote - # we're in kernel land already, so no need to convert arguments + # we're in kernel land already, so no need to cudaconvert arguments local kernel_tt = Tuple{$((:(Core.Typeof($var)) for var in var_exprs)...)} local kernel = dynamic_cufunction($(esc(f)), kernel_tt) - dynamic_cudacall(kernel, kernel_tt, $(var_exprs...); $(map(esc, call_kwargs)...)) + kernel($(var_exprs...); $(map(esc, call_kwargs)...)) end) else # regular, host-side kernel launch @@ -363,8 +363,27 @@ Kernel ## dynamic parallelism -dynamic_cufunction(f::Core.Function, tt::Type=Tuple{}) = - ccall("extern cudanativeCompileKernel", llvmcall, Ptr{Cvoid}, (Any, Any), f, tt) +struct DynamicKernel{F,TT} + fun::Ptr{Cvoid} +end + +function dynamic_cufunction(f::Core.Function, tt::Type=Tuple{}) + # we can't compile here, so drop a marker which will get picked up during compilation + fptr = ccall("extern cudanativeCompileKernel", llvmcall, Ptr{Cvoid}, (Any, Any), f, tt) + DynamicKernel{f,tt}(fptr) +end + +@generated function (kernel::DynamicKernel{F,TT})(args...; call_kwargs...) where {F,TT} + # TODO + call_args = :(args) + call_tt = TT + + quote + Base.@_inline_meta + + dynamic_cudacall(kernel.fun, $call_tt, $call_args...; call_kwargs...) + end +end @generated function dynamic_cudacall(f::Ptr{Cvoid}, tt::Type, args...; blocks::CuDim=1, threads::CuDim=1, shmem::Integer=0, From 4c1b12d94c711e57d6d85e7b4c95915ef56a6fee Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Wed, 20 Mar 2019 16:14:26 +0100 Subject: [PATCH 16/34] Ghost argument filtering, etc. 
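
The filtering added here is easiest to see with a host-side sketch. This is
illustrative only and not code from the patch; the predicate `isghost` below is
a stand-in for the package's actual ghost-type check. The idea: argument types
that carry no runtime data (singleton structs, the function object itself)
occupy zero bytes, so they are dropped from the device-side call signature,
while non-isbits arguments must be unused and are replaced by a null pointer.

    # illustrative only: separating ghost types from payload-carrying ones
    struct Ghost end                          # singleton, no fields

    isghost(T::DataType) = isbitstype(T) && sizeof(T) == 0

    sig  = [Ghost, Int32, Float64]            # pretend kernel parameter types
    keep = [T for T in sig if !isghost(T)]
    @assert keep == [Int32, Float64]          # only these reach the parameter buffer
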
--- src/execution.jl | 38 +++++++++++++++++++++++++++++--------- 1 file changed, 29 insertions(+), 9 deletions(-) diff --git a/src/execution.jl b/src/execution.jl index 5feafa5d..e62e10c6 100644 --- a/src/execution.jl +++ b/src/execution.jl @@ -373,34 +373,54 @@ function dynamic_cufunction(f::Core.Function, tt::Type=Tuple{}) DynamicKernel{f,tt}(fptr) end +# FIXME: duplication with (::Kernel)(...) @generated function (kernel::DynamicKernel{F,TT})(args...; call_kwargs...) where {F,TT} - # TODO - call_args = :(args) - call_tt = TT + sig = Base.signature_type(F, TT) + args = (:F, (:( args[$i] ) for i in 1:length(args))...) + + # filter out ghost arguments that shouldn't be passed + to_pass = map(!isghosttype, sig.parameters) + call_t = Type[x[1] for x in zip(sig.parameters, to_pass) if x[2]] + call_args = Union{Expr,Symbol}[x[1] for x in zip(args, to_pass) if x[2]] + + # replace non-isbits arguments (they should be unused, or compilation would have failed) + # alternatively, make CUDAdrv allow `launch` with non-isbits arguments. + for (i,dt) in enumerate(call_t) + if !isbitstype(dt) + call_t[i] = Ptr{Any} + call_args[i] = :C_NULL + end + end + + # finalize types + call_tt = Base.to_tuple_type(call_t) quote Base.@_inline_meta - dynamic_cudacall(kernel.fun, $call_tt, $call_args...; call_kwargs...) + dynamic_cudacall(kernel.fun, $call_tt, $(call_args...); call_kwargs...) end end +# FIXME: duplication with CUDAdrv.cudacall @generated function dynamic_cudacall(f::Ptr{Cvoid}, tt::Type, args...; blocks::CuDim=1, threads::CuDim=1, shmem::Integer=0, stream::CuStream=CuDefaultStream()) + types = tt.parameters[1].parameters # the type of `tt` is Type{Tuple{<:DataType...}} + ex = quote Base.@_inline_meta end # convert the argument values to match the kernel's signature (specified by the user) # (this mimics `lower-ccall` in julia-syntax.scm) - converted_args = Vector{Symbol}(undef, length(args)) - arg_ptrs = Vector{Symbol}(undef, length(args)) - for i in 1:length(args) + converted_args = Vector{Symbol}(undef, length(types)) + arg_ptrs = Vector{Symbol}(undef, length(types)) + for i in 1:length(types) converted_args[i] = gensym() arg_ptrs[i] = gensym() - push!(ex.args, :($(converted_args[i]) = Base.cconvert($(args[i]), args[$i]))) - push!(ex.args, :($(arg_ptrs[i]) = Base.unsafe_convert($(args[i]), $(converted_args[i])))) + push!(ex.args, :($(converted_args[i]) = Base.cconvert($(types[i]), args[$i]))) + push!(ex.args, :($(arg_ptrs[i]) = Base.unsafe_convert($(types[i]), $(converted_args[i])))) end append!(ex.args, (quote From cc0aac65c21f0443f4432e669df3dd60a51ba61b Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Wed, 20 Mar 2019 16:27:28 +0100 Subject: [PATCH 17/34] Start a Kernel type hierarchy. --- src/execution.jl | 54 +++++++++++++++++++++++++----------------------- 1 file changed, 28 insertions(+), 26 deletions(-) diff --git a/src/execution.jl b/src/execution.jl index e62e10c6..fff604bb 100644 --- a/src/execution.jl +++ b/src/execution.jl @@ -3,21 +3,23 @@ export @cuda, cudaconvert, cufunction, nearest_warpsize -## kernel object and query functions +## host-side kernels -struct Kernel{F,TT} +abstract type AbstractKernel{F,TT} end + +struct HostKernel{F,TT} <: AbstractKernel{F,TT} ctx::CuContext mod::CuModule fun::CuFunction end """ - version(k::Kernel) + version(k::HostKernel) Queries the PTX and SM versions a kernel was compiled for. Returns a named tuple. 
""" -function version(k::Kernel) +function version(k::HostKernel) attr = attributes(k.fun) binary_ver = VersionNumber(divrem(attr[CUDAdrv.FUNC_ATTRIBUTE_BINARY_VERSION],10)...) ptx_ver = VersionNumber(divrem(attr[CUDAdrv.FUNC_ATTRIBUTE_PTX_VERSION],10)...) @@ -25,12 +27,12 @@ function version(k::Kernel) end """ - memory(k::Kernel) + memory(k::HostKernel) Queries the local, shared and constant memory usage of a compiled kernel in bytes. Returns a named tuple. """ -function memory(k::Kernel) +function memory(k::HostKernel) attr = attributes(k.fun) local_mem = attr[CUDAdrv.FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES] shared_mem = attr[CUDAdrv.FUNC_ATTRIBUTE_SHARED_SIZE_BYTES] @@ -39,21 +41,21 @@ function memory(k::Kernel) end """ - registers(k::Kernel) + registers(k::HostKernel) Queries the register usage of a kernel. """ -function registers(k::Kernel) +function registers(k::HostKernel) attr = attributes(k.fun) return attr[CUDAdrv.FUNC_ATTRIBUTE_NUM_REGS] end """ - maxthreads(k::Kernel) + maxthreads(k::HostKernel) Queries the maximum amount of threads a kernel can use in a single block. """ -function maxthreads(k::Kernel) +function maxthreads(k::HostKernel) attr = attributes(k.fun) return attr[CUDAdrv.FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK] end @@ -171,7 +173,7 @@ performed, scheduling a kernel launch on the current CUDA context. Several keyword arguments are supported that influence the behavior of `@cuda`. - `dynamic`: use dynamic parallelism to launch device-side kernels - arguments that influence kernel compilation: see [`cufunction`](@ref) -- arguments that influence kernel execution: see [`CUDAnative.Kernel`](@ref) +- arguments that influence kernel execution: see [`CUDAnative.HostKernel`](@ref) The underlying operations (argument conversion, kernel compilation, kernel call) can be performed explicitly when more control is needed, e.g. to reflect on the resource usage of a @@ -245,10 +247,10 @@ macro cuda(ex...) end -## APIs for manual compilation +## host-side launch API const agecache = Dict{UInt, UInt}() -const compilecache = Dict{UInt, Kernel}() +const compilecache = Dict{UInt, HostKernel}() """ cufunction(f, tt=Tuple{}; kwargs...) @@ -301,7 +303,7 @@ when function changes, or when different types or keyword arguments are provided dev = device(ctx) cap = supported_capability(dev) fun, mod = compile(:cuda, cap, f, tt; kwargs...) - kernel = Kernel{f,tt}(ctx, mod, fun) + kernel = HostKernel{f,tt}(ctx, mod, fun) @debug begin ver = version(kernel) mem = memory(kernel) @@ -312,11 +314,11 @@ when function changes, or when different types or keyword arguments are provided compilecache[key] = kernel end - return compilecache[key]::Kernel{f,tt} + return compilecache[key]::HostKernel{f,tt} end end -@generated function (kernel::Kernel{F,TT})(args...; call_kwargs...) where {F,TT} +@generated function (kernel::HostKernel{F,TT})(args...; call_kwargs...) where {F,TT} sig = Base.signature_type(F, TT) args = (:F, (:( args[$i] ) for i in 1:length(args))...) @@ -347,7 +349,7 @@ end # FIXME: there doesn't seem to be a way to access the documentation for the call-syntax, # so attach it to the type """ - (::Kernel)(args...; kwargs...) + (::HostKernel)(args...; kwargs...) Low-level interface to call a compiled kernel, passing GPU-compatible arguments in `args`. For a higher-level interface, use [`@cuda`](@ref). 
@@ -358,23 +360,23 @@ The following keyword arguments are supported: - `shmem` (defaults to 0) - `stream` (defaults to the default stream) """ -Kernel +HostKernel -## dynamic parallelism +## device-side kernels and launch API (aka. dynamic parallelism) -struct DynamicKernel{F,TT} +struct DeviceKernel{F,TT} <: AbstractKernel{F,TT} fun::Ptr{Cvoid} end function dynamic_cufunction(f::Core.Function, tt::Type=Tuple{}) # we can't compile here, so drop a marker which will get picked up during compilation fptr = ccall("extern cudanativeCompileKernel", llvmcall, Ptr{Cvoid}, (Any, Any), f, tt) - DynamicKernel{f,tt}(fptr) + DeviceKernel{f,tt}(fptr) end -# FIXME: duplication with (::Kernel)(...) -@generated function (kernel::DynamicKernel{F,TT})(args...; call_kwargs...) where {F,TT} +# FIXME: duplication with (::HostKernel)(...) +@generated function (kernel::DeviceKernel{F,TT})(args...; call_kwargs...) where {F,TT} sig = Base.signature_type(F, TT) args = (:F, (:( args[$i] ) for i in 1:length(args))...) @@ -414,9 +416,9 @@ end # convert the argument values to match the kernel's signature (specified by the user) # (this mimics `lower-ccall` in julia-syntax.scm) - converted_args = Vector{Symbol}(undef, length(types)) - arg_ptrs = Vector{Symbol}(undef, length(types)) - for i in 1:length(types) + converted_args = Vector{Symbol}(undef, length(args)) + arg_ptrs = Vector{Symbol}(undef, length(args)) + for i in 1:length(args) converted_args[i] = gensym() arg_ptrs[i] = gensym() push!(ex.args, :($(converted_args[i]) = Base.cconvert($(types[i]), args[$i]))) From 8645bb08a6a3dc4ae1ba9e536cc9c63cb093114a Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Thu, 21 Mar 2019 08:47:03 +0100 Subject: [PATCH 18/34] Merge identical dynamic kernel invocations. --- src/compiler/driver.jl | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/src/compiler/driver.jl b/src/compiler/driver.jl index 8ef39bd9..1e280fa4 100644 --- a/src/compiler/driver.jl +++ b/src/compiler/driver.jl @@ -98,7 +98,7 @@ function compile(to::Symbol, ctx::CompilerContext; # find dynamic kernel invocations # TODO: recover this information earlier, from the Julia IR - worklist = [] + worklist = Dict{Tuple{Core.Function,Type}, Vector{LLVM.CallInst}}() for use in uses(f) # decode the call call = user(use)::LLVM.CallInst @@ -113,11 +113,12 @@ function compile(to::Symbol, ctx::CompilerContext; ops = Ptr{Any}.(ops) dyn_f, dyn_tt = unsafe_pointer_to_objref.(ops) - push!(worklist, (call, dyn_f, dyn_tt)) + calls = get!(worklist, (dyn_f, dyn_tt), LLVM.CallInst[]) + push!(calls, call) end # compile and link - for (call, dyn_f, dyn_tt) in worklist + for (dyn_f, dyn_tt) in keys(worklist) dyn_ctx = CompilerContext(dyn_f, dyn_tt, ctx.cap, true) dyn_ir, dyn_entry = compile(:llvm, dyn_ctx; hooks=false, optimize=optimize, strip=strip) @@ -127,8 +128,11 @@ function compile(to::Symbol, ctx::CompilerContext; dyn_ir = nothing dyn_entry = functions(ir)[dyn_fn] - replace_uses!(call, dyn_entry) - unsafe_delete!(LLVM.parent(call), call) + # insert a call everywhere the kernel is used + for call in worklist[(dyn_f,dyn_tt)] + replace_uses!(call, dyn_entry) + unsafe_delete!(LLVM.parent(call), call) + end end @compiler_assert isempty(uses(f)) ctx From 695fda4d123bc4236c01503645650c36353e4c46 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Thu, 21 Mar 2019 09:01:19 +0100 Subject: [PATCH 19/34] Detect closures. 
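
For context, a small host-side sketch (not part of the patch) of why checking
`sizeof` of the function type works as a closure test: a top-level generic
function is a singleton type with no fields, whereas a closure stores its
captured variables as fields and therefore has a nonzero instance size.

    plain(x) = x + 1                      # singleton function type
    make_adder(y) = x -> x + y            # returns a closure capturing `y`
    closed = make_adder(2.0)

    @assert sizeof(typeof(plain)) == 0    # fine for dynamic parallelism
    @assert sizeof(typeof(closed)) > 0    # rejected by dynamic_cufunction

Since the check runs in a generated function on the device, the failure is
reported with a printed error followed by a trap rather than a Julia exception.
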
--- src/CUDAnative.jl | 1 + src/device/llvm.jl | 3 +++ src/execution.jl | 19 +++++++++++++++---- 3 files changed, 19 insertions(+), 4 deletions(-) create mode 100644 src/device/llvm.jl diff --git a/src/CUDAnative.jl b/src/CUDAnative.jl index cec0b9cf..018729ae 100644 --- a/src/CUDAnative.jl +++ b/src/CUDAnative.jl @@ -27,6 +27,7 @@ include(joinpath("device", "tools.jl")) include(joinpath("device", "pointer.jl")) include(joinpath("device", "array.jl")) include(joinpath("device", "cuda.jl")) +include(joinpath("device", "llvm.jl")) include(joinpath("device", "runtime.jl")) include("compiler.jl") diff --git a/src/device/llvm.jl b/src/device/llvm.jl new file mode 100644 index 00000000..ddf7364d --- /dev/null +++ b/src/device/llvm.jl @@ -0,0 +1,3 @@ +# wrappers for LLVM-specific functionality + +@inline trap() = ccall("llvm.trap", llvmcall, Cvoid, ()) diff --git a/src/execution.jl b/src/execution.jl index fff604bb..54882cf4 100644 --- a/src/execution.jl +++ b/src/execution.jl @@ -369,10 +369,21 @@ struct DeviceKernel{F,TT} <: AbstractKernel{F,TT} fun::Ptr{Cvoid} end -function dynamic_cufunction(f::Core.Function, tt::Type=Tuple{}) - # we can't compile here, so drop a marker which will get picked up during compilation - fptr = ccall("extern cudanativeCompileKernel", llvmcall, Ptr{Cvoid}, (Any, Any), f, tt) - DeviceKernel{f,tt}(fptr) +@generated function dynamic_cufunction(f::Core.Function, tt::Type=Tuple{}) + if sizeof(f) > 0 + Core.println(Core.stderr, "ERROR: @cuda dynamic parallelism does not support closures") + quote + trap() + DeviceKernel{f,tt}(C_NULL) + end + else + # we can't compile here, so drop a marker which will get picked up during compilation + quote + fptr = ccall("extern cudanativeCompileKernel", llvmcall, Ptr{Cvoid}, + (Any, Any), f, tt) + DeviceKernel{f,tt}(fptr) + end + end end # FIXME: duplication with (::HostKernel)(...) From 7bd4628add0bde44b5055589c3503735d5a8a4c4 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Thu, 21 Mar 2019 10:20:19 +0100 Subject: [PATCH 20/34] Avoid duplication by attaching call to AbstractKernel. --- src/execution.jl | 343 +++++++++++++++++++++++------------------------ 1 file changed, 168 insertions(+), 175 deletions(-) diff --git a/src/execution.jl b/src/execution.jl index 54882cf4..20ffb2c9 100644 --- a/src/execution.jl +++ b/src/execution.jl @@ -3,64 +3,6 @@ export @cuda, cudaconvert, cufunction, nearest_warpsize -## host-side kernels - -abstract type AbstractKernel{F,TT} end - -struct HostKernel{F,TT} <: AbstractKernel{F,TT} - ctx::CuContext - mod::CuModule - fun::CuFunction -end - -""" - version(k::HostKernel) - -Queries the PTX and SM versions a kernel was compiled for. -Returns a named tuple. -""" -function version(k::HostKernel) - attr = attributes(k.fun) - binary_ver = VersionNumber(divrem(attr[CUDAdrv.FUNC_ATTRIBUTE_BINARY_VERSION],10)...) - ptx_ver = VersionNumber(divrem(attr[CUDAdrv.FUNC_ATTRIBUTE_PTX_VERSION],10)...) - return (ptx=ptx_ver, binary=binary_ver) -end - -""" - memory(k::HostKernel) - -Queries the local, shared and constant memory usage of a compiled kernel in bytes. -Returns a named tuple. -""" -function memory(k::HostKernel) - attr = attributes(k.fun) - local_mem = attr[CUDAdrv.FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES] - shared_mem = attr[CUDAdrv.FUNC_ATTRIBUTE_SHARED_SIZE_BYTES] - constant_mem = attr[CUDAdrv.FUNC_ATTRIBUTE_CONST_SIZE_BYTES] - return (:local=>local_mem, shared=shared_mem, constant=constant_mem) -end - -""" - registers(k::HostKernel) - -Queries the register usage of a kernel. 
-""" -function registers(k::HostKernel) - attr = attributes(k.fun) - return attr[CUDAdrv.FUNC_ATTRIBUTE_NUM_REGS] -end - -""" - maxthreads(k::HostKernel) - -Queries the maximum amount of threads a kernel can use in a single block. -""" -function maxthreads(k::HostKernel) - attr = attributes(k.fun) - return attr[CUDAdrv.FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK] -end - - ## helper functions # split keyword arguments to `@cuda` into ones affecting the macro itself, the compiler and @@ -131,34 +73,6 @@ function method_age(f, tt)::UInt end -## adaptors - -struct Adaptor end - -# convert CUDAdrv pointers to CUDAnative pointers -Adapt.adapt_storage(to::Adaptor, p::CuPtr{T}) where {T} = DevicePtr{T,AS.Generic}(p) - -# Base.RefValue isn't GPU compatible, so provide a compatible alternative -struct CuRefValue{T} <: Ref{T} - x::T -end -Base.getindex(r::CuRefValue) = r.x -Adapt.adapt_structure(to::Adaptor, r::Base.RefValue) = CuRefValue(adapt(to, r[])) - -# convenience function -""" - cudaconvert(x) - -This function is called for every argument to be passed to a kernel, allowing it to be -converted to a GPU-friendly format. By default, the function does nothing and returns the -input object `x` as-is. - -Do not add methods to this function, but instead extend the underlying Adapt.jl package and -register methods for the the `CUDAnative.Adaptor` type. -""" -cudaconvert(arg) = adapt(Adaptor(), arg) - - ## high-level @cuda interface """ @@ -247,7 +161,145 @@ macro cuda(ex...) end -## host-side launch API +## host to device value conversion + +struct Adaptor end + +# convert CUDAdrv pointers to CUDAnative pointers +Adapt.adapt_storage(to::Adaptor, p::CuPtr{T}) where {T} = DevicePtr{T,AS.Generic}(p) + +# Base.RefValue isn't GPU compatible, so provide a compatible alternative +struct CuRefValue{T} <: Ref{T} + x::T +end +Base.getindex(r::CuRefValue) = r.x +Adapt.adapt_structure(to::Adaptor, r::Base.RefValue) = CuRefValue(adapt(to, r[])) + +""" + cudaconvert(x) + +This function is called for every argument to be passed to a kernel, allowing it to be +converted to a GPU-friendly format. By default, the function does nothing and returns the +input object `x` as-is. + +Do not add methods to this function, but instead extend the underlying Adapt.jl package and +register methods for the the `CUDAnative.Adaptor` type. +""" +cudaconvert(arg) = adapt(Adaptor(), arg) + + +## abstract kernel functionality + +abstract type AbstractKernel{F,TT} end + +# FIXME: there doesn't seem to be a way to access the documentation for the call-syntax, +# so attach it to the type +""" + (::HostKernel)(args...; kwargs...) + (::DeviceKernel)(args...; kwargs...) + +Low-level interface to call a compiled kernel, passing GPU-compatible arguments in `args`. +For a higher-level interface, use [`@cuda`](@ref). + +The following keyword arguments are supported: +- `threads` (defaults to 1) +- `blocks` (defaults to 1) +- `shmem` (defaults to 0) +- `stream` (defaults to the default stream) +""" +AbstractKernel + +@generated function call(kernel::AbstractKernel{F,TT}, args...; call_kwargs...) where {F,TT} + sig = Base.signature_type(F, TT) + args = (:F, (:( args[$i] ) for i in 1:length(args))...) 
+ + # filter out ghost arguments that shouldn't be passed + to_pass = map(!isghosttype, sig.parameters) + call_t = Type[x[1] for x in zip(sig.parameters, to_pass) if x[2]] + call_args = Union{Expr,Symbol}[x[1] for x in zip(args, to_pass) if x[2]] + + # replace non-isbits arguments (they should be unused, or compilation would have failed) + # alternatively, make CUDAdrv allow `launch` with non-isbits arguments. + for (i,dt) in enumerate(call_t) + if !isbitstype(dt) + call_t[i] = Ptr{Any} + call_args[i] = :C_NULL + end + end + + # finalize types + call_tt = Base.to_tuple_type(call_t) + + quote + Base.@_inline_meta + + cudacall(kernel, $call_tt, $(call_args...); call_kwargs...) + end +end + + +## host-side kernels + +struct HostKernel{F,TT} <: AbstractKernel{F,TT} + ctx::CuContext + mod::CuModule + fun::CuFunction +end + +@doc (@doc AbstractKernel) HostKernel + +@inline cudacall(kernel::HostKernel, tt, args...; kwargs...) = + CUDAdrv.cudacall(kernel.fun, tt, args...; kwargs...) + +""" + version(k::HostKernel) + +Queries the PTX and SM versions a kernel was compiled for. +Returns a named tuple. +""" +function version(k::HostKernel) + attr = attributes(k.fun) + binary_ver = VersionNumber(divrem(attr[CUDAdrv.FUNC_ATTRIBUTE_BINARY_VERSION],10)...) + ptx_ver = VersionNumber(divrem(attr[CUDAdrv.FUNC_ATTRIBUTE_PTX_VERSION],10)...) + return (ptx=ptx_ver, binary=binary_ver) +end + +""" + memory(k::HostKernel) + +Queries the local, shared and constant memory usage of a compiled kernel in bytes. +Returns a named tuple. +""" +function memory(k::HostKernel) + attr = attributes(k.fun) + local_mem = attr[CUDAdrv.FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES] + shared_mem = attr[CUDAdrv.FUNC_ATTRIBUTE_SHARED_SIZE_BYTES] + constant_mem = attr[CUDAdrv.FUNC_ATTRIBUTE_CONST_SIZE_BYTES] + return (:local=>local_mem, shared=shared_mem, constant=constant_mem) +end + +""" + registers(k::HostKernel) + +Queries the register usage of a kernel. +""" +function registers(k::HostKernel) + attr = attributes(k.fun) + return attr[CUDAdrv.FUNC_ATTRIBUTE_NUM_REGS] +end + +""" + maxthreads(k::HostKernel) + +Queries the maximum amount of threads a kernel can use in a single block. +""" +function maxthreads(k::HostKernel) + attr = attributes(k.fun) + return attr[CUDAdrv.FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK] +end + + +## host-side API const agecache = Dict{UInt, UInt}() const compilecache = Dict{UInt, HostKernel}() @@ -318,102 +370,20 @@ when function changes, or when different types or keyword arguments are provided end end -@generated function (kernel::HostKernel{F,TT})(args...; call_kwargs...) where {F,TT} - sig = Base.signature_type(F, TT) - args = (:F, (:( args[$i] ) for i in 1:length(args))...) +# https://github.com/JuliaLang/julia/issues/14919 +(kernel::HostKernel)(args...; kwargs...) = call(kernel, args...; kwargs...) - # filter out ghost arguments that shouldn't be passed - to_pass = map(!isghosttype, sig.parameters) - call_t = Type[x[1] for x in zip(sig.parameters, to_pass) if x[2]] - call_args = Union{Expr,Symbol}[x[1] for x in zip(args, to_pass) if x[2]] - # replace non-isbits arguments (they should be unused, or compilation would have failed) - # alternatively, make CUDAdrv allow `launch` with non-isbits arguments. - for (i,dt) in enumerate(call_t) - if !isbitstype(dt) - call_t[i] = Ptr{Any} - call_args[i] = :C_NULL - end - end - - # finalize types - call_tt = Base.to_tuple_type(call_t) - - quote - Base.@_inline_meta - - cudacall(kernel.fun, $call_tt, $(call_args...); call_kwargs...) 
- end -end - -# FIXME: there doesn't seem to be a way to access the documentation for the call-syntax, -# so attach it to the type -""" - (::HostKernel)(args...; kwargs...) - -Low-level interface to call a compiled kernel, passing GPU-compatible arguments in `args`. -For a higher-level interface, use [`@cuda`](@ref). - -The following keyword arguments are supported: -- `threads` (defaults to 1) -- `blocks` (defaults to 1) -- `shmem` (defaults to 0) -- `stream` (defaults to the default stream) -""" -HostKernel - - -## device-side kernels and launch API (aka. dynamic parallelism) +## device-side kernels struct DeviceKernel{F,TT} <: AbstractKernel{F,TT} fun::Ptr{Cvoid} end -@generated function dynamic_cufunction(f::Core.Function, tt::Type=Tuple{}) - if sizeof(f) > 0 - Core.println(Core.stderr, "ERROR: @cuda dynamic parallelism does not support closures") - quote - trap() - DeviceKernel{f,tt}(C_NULL) - end - else - # we can't compile here, so drop a marker which will get picked up during compilation - quote - fptr = ccall("extern cudanativeCompileKernel", llvmcall, Ptr{Cvoid}, - (Any, Any), f, tt) - DeviceKernel{f,tt}(fptr) - end - end -end - -# FIXME: duplication with (::HostKernel)(...) -@generated function (kernel::DeviceKernel{F,TT})(args...; call_kwargs...) where {F,TT} - sig = Base.signature_type(F, TT) - args = (:F, (:( args[$i] ) for i in 1:length(args))...) - - # filter out ghost arguments that shouldn't be passed - to_pass = map(!isghosttype, sig.parameters) - call_t = Type[x[1] for x in zip(sig.parameters, to_pass) if x[2]] - call_args = Union{Expr,Symbol}[x[1] for x in zip(args, to_pass) if x[2]] - - # replace non-isbits arguments (they should be unused, or compilation would have failed) - # alternatively, make CUDAdrv allow `launch` with non-isbits arguments. - for (i,dt) in enumerate(call_t) - if !isbitstype(dt) - call_t[i] = Ptr{Any} - call_args[i] = :C_NULL - end - end - - # finalize types - call_tt = Base.to_tuple_type(call_t) - - quote - Base.@_inline_meta +@doc (@doc AbstractKernel) DeviceKernel - dynamic_cudacall(kernel.fun, $call_tt, $(call_args...); call_kwargs...) - end -end +@inline cudacall(kernel::DeviceKernel, tt, args...; kwargs...) = + dynamic_cudacall(kernel.fun, tt, args...; kwargs...) # FIXME: duplication with CUDAdrv.cudacall @generated function dynamic_cudacall(f::Ptr{Cvoid}, tt::Type, args...; @@ -446,6 +416,29 @@ end end +## device-side API + +@generated function dynamic_cufunction(f::Core.Function, tt::Type=Tuple{}) + if sizeof(f) > 0 + Core.println(Core.stderr, "ERROR: @cuda dynamic parallelism does not support closures") + quote + trap() + DeviceKernel{f,tt}(C_NULL) + end + else + # we can't compile here, so drop a marker which will get picked up during compilation + quote + fptr = ccall("extern cudanativeCompileKernel", llvmcall, Ptr{Cvoid}, + (Any, Any), f, tt) + DeviceKernel{f,tt}(fptr) + end + end +end + +# https://github.com/JuliaLang/julia/issues/14919 +(kernel::DeviceKernel)(args...; kwargs...) = call(kernel, args...; kwargs...) + + ## other """ From b74ed8f11df566f9f9dcaf37e9d11119978fb9e4 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Thu, 21 Mar 2019 10:38:18 +0100 Subject: [PATCH 21/34] Documentation, error checking, etc. --- src/execution.jl | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/src/execution.jl b/src/execution.jl index 20ffb2c9..428f586a 100644 --- a/src/execution.jl +++ b/src/execution.jl @@ -86,8 +86,10 @@ performed, scheduling a kernel launch on the current CUDA context. 
Several keyword arguments are supported that influence the behavior of `@cuda`. - `dynamic`: use dynamic parallelism to launch device-side kernels -- arguments that influence kernel compilation: see [`cufunction`](@ref) -- arguments that influence kernel execution: see [`CUDAnative.HostKernel`](@ref) +- arguments that influence kernel compilation: see [`cufunction`](@ref) and + [`dynamic_cufunction`](@ref) +- arguments that influence kernel launch: see [`CUDAnative.HostKernel`](@ref) and + [`CUDAnative.DeviceKernel`](@ref) The underlying operations (argument conversion, kernel compilation, kernel call) can be performed explicitly when more control is needed, e.g. to reflect on the resource usage of a @@ -100,6 +102,15 @@ kernel to determine the launch configuration. A host-side kernel launch is done kernel = cufunction(f, kernel_tt; compilation_kwargs) kernel(kernel_args...; launch_kwargs) end + +A device-side launch, aka. dynamic parallelism, is similar but more restricted: + + args = ... + # GC.@preserve is not supported + # we're on the device already, so no need to cudaconvert + kernel_tt = Tuple{Core.Typeof(args[1]), ...} # this needs to be fully inferred! + kernel = dynamic_cufunction(f, kernel_tt) # no compiler kwargs supported + kernel(args...; launch_kwargs) """ macro cuda(ex...) # destructure the `@cuda` expression @@ -125,6 +136,7 @@ macro cuda(ex...) for kwarg in macro_kwargs key,val = kwarg.args if key == :dynamic + isa(val, Bool) || throw(ArgumentError("`dynamic` keyword argument to @cuda should be a constant value")) dynamic = val::Bool else throw(ArgumentError("Unsupported keyword argument '$key'")) @@ -132,6 +144,11 @@ macro cuda(ex...) end if dynamic + # FIXME: we could probably somehow support kwargs with constant values by either + # saving them in a global Dict here, or trying to pick them up from the Julia + # IR when processing the dynamic parallelism marker + isempty(compiler_kwargs) || error("@cuda dynamic parallelism does not support compiler keyword arguments") + # dynamic, device-side kernel launch push!(code.args, quote @@ -418,6 +435,14 @@ end ## device-side API +""" + dynamic_cufunction(f, tt=Tuple{}) + +Low-level interface to compile a function invocation for the currently-active GPU, returning +a callable kernel object. Device-side equivalent of [`CUDAnative.cufunction`](@ref). + +No keyword arguments are supported. +""" @generated function dynamic_cufunction(f::Core.Function, tt::Type=Tuple{}) if sizeof(f) > 0 Core.println(Core.stderr, "ERROR: @cuda dynamic parallelism does not support closures") From cd6e2aef4f3948afd598b71c0fade8393d249193 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Thu, 21 Mar 2019 11:10:38 +0100 Subject: [PATCH 22/34] Rename CompilerContext to CompilerJob. It's not really a context. Also disambiguates from CUDA's and LLVM's contexts. 
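
A rough usage sketch of the renamed type (the kernel, tuple type and capability
below are made-up values mirroring how reflection.jl constructs jobs; this is
not code from the patch):

    dummy(x) = nothing                                  # hypothetical kernel
    job = CompilerJob(dummy, Tuple{Float64}, v"6.1", #=kernel=#true)
    ir, entry = compile(:llvm, job; hooks=false)        # LLVM module + entry
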
--- src/compiler/common.jl | 38 +++++++++++----------- src/compiler/driver.jl | 40 +++++++++++------------ src/compiler/irgen.jl | 30 ++++++++--------- src/compiler/mcgen.jl | 12 +++---- src/compiler/optim.jl | 52 +++++++++++++++--------------- src/compiler/rtlib.jl | 6 ++-- src/compiler/validation.jl | 46 +++++++++++++------------- src/reflection.jl | 66 +++++++++++++++++++------------------- 8 files changed, 145 insertions(+), 145 deletions(-) diff --git a/src/compiler/common.jl b/src/compiler/common.jl index f11eee5f..b9160a5f 100644 --- a/src/compiler/common.jl +++ b/src/compiler/common.jl @@ -1,6 +1,6 @@ # common functionality -struct CompilerContext +struct CompilerJob # core invocation f::Core.Function tt::DataType @@ -13,38 +13,38 @@ struct CompilerContext blocks_per_sm::Union{Nothing,Integer} maxregs::Union{Nothing,Integer} - CompilerContext(f, tt, cap, kernel; + CompilerJob(f, tt, cap, kernel; minthreads=nothing, maxthreads=nothing, blocks_per_sm=nothing, maxregs=nothing) = new(f, tt, cap, kernel, minthreads, maxthreads, blocks_per_sm, maxregs) end -# global context reference -# FIXME: thread through `ctx` everywhere (deadlocks the Julia compiler when doing so with +# global job reference +# FIXME: thread through `job` everywhere (deadlocks the Julia compiler when doing so with # the LLVM passes in CUDAnative) -global_ctx = nothing +current_job = nothing -function signature(ctx::CompilerContext) - fn = typeof(ctx.f).name.mt.name - args = join(ctx.tt.parameters, ", ") - return "$fn($(join(ctx.tt.parameters, ", ")))" +function signature(job::CompilerJob) + fn = typeof(job.f).name.mt.name + args = join(job.tt.parameters, ", ") + return "$fn($(join(job.tt.parameters, ", ")))" end struct KernelError <: Exception - ctx::CompilerContext + job::CompilerJob message::String help::Union{Nothing,String} bt::StackTraces.StackTrace - KernelError(ctx::CompilerContext, message::String, help=nothing; + KernelError(job::CompilerJob, message::String, help=nothing; bt=StackTraces.StackTrace()) = - new(ctx, message, help, bt) + new(job, message, help, bt) end function Base.showerror(io::IO, err::KernelError) - println(io, "GPU compilation of $(signature(err.ctx)) failed") + println(io, "GPU compilation of $(signature(err.job)) failed") println(io, "KernelError: $(err.message)") println(io) println(io, something(err.help, "Try inspecting the generated code with any of the @device_code_... macros.")) @@ -53,10 +53,10 @@ end struct InternalCompilerError <: Exception - ctx::CompilerContext + job::CompilerJob message::String meta::Dict - InternalCompilerError(ctx, message; kwargs...) = new(ctx, message, kwargs) + InternalCompilerError(job, message; kwargs...) = new(job, message, kwargs) end function Base.showerror(io::IO, err::InternalCompilerError) @@ -67,8 +67,8 @@ function Base.showerror(io::IO, err::InternalCompilerError) println(io, "\nInternalCompilerError: $(err.message)") println(io, "\nCompiler invocation:") - for field in fieldnames(CompilerContext) - println(io, " - $field = $(repr(getfield(err.ctx, field)))") + for field in fieldnames(CompilerJob) + println(io, " - $field = $(repr(getfield(err.job, field)))") end if !isempty(err.meta) @@ -87,10 +87,10 @@ function Base.showerror(io::IO, err::InternalCompilerError) versioninfo(io) end -macro compiler_assert(ex, ctx, kwargs...) +macro compiler_assert(ex, job, kwargs...) msg = "$ex, at $(__source__.file):$(__source__.line)" return :($(esc(ex)) ? 
$(nothing) - : throw(InternalCompilerError($(esc(ctx)), $msg; + : throw(InternalCompilerError($(esc(job)), $msg; $(map(esc, kwargs)...))) ) end diff --git a/src/compiler/driver.jl b/src/compiler/driver.jl index 1e280fa4..9687b050 100644 --- a/src/compiler/driver.jl +++ b/src/compiler/driver.jl @@ -1,6 +1,6 @@ # compiler driver and main interface -# (::CompilerContext) +# (::CompilerJob) const compile_hook = Ref{Union{Nothing,Function}}(nothing) """ @@ -22,18 +22,18 @@ Other keyword arguments can be found in the documentation of [`cufunction`](@ref compile(to::Symbol, cap::VersionNumber, @nospecialize(f::Core.Function), @nospecialize(tt), kernel::Bool=true; hooks::Bool=true, optimize::Bool=true, strip::Bool=false, kwargs...) = - compile(to, CompilerContext(f, tt, cap, kernel; kwargs...); + compile(to, CompilerJob(f, tt, cap, kernel; kwargs...); hooks=hooks, optimize=optimize, strip=strip) -function compile(to::Symbol, ctx::CompilerContext; +function compile(to::Symbol, job::CompilerJob; hooks::Bool=true, optimize::Bool=true, strip::Bool=false) - @debug "(Re)compiling function" ctx + @debug "(Re)compiling function" job if hooks && compile_hook[] != nothing global globalUnique previous_globalUnique = globalUnique - compile_hook[](ctx) + compile_hook[](job) globalUnique = previous_globalUnique end @@ -41,12 +41,12 @@ function compile(to::Symbol, ctx::CompilerContext; ## Julia IR - check_method(ctx) + check_method(job) # get the method instance world = typemax(UInt) - meth = which(ctx.f, ctx.tt) - sig = Base.signature_type(ctx.f, ctx.tt)::Type + meth = which(job.f, job.tt) + sig = Base.signature_type(job.f, job.tt)::Type (ti, env) = ccall(:jl_type_intersection_with_env, Any, (Any, Any), sig, meth.sig)::Core.SimpleVector if VERSION >= v"1.2.0-DEV.320" @@ -62,26 +62,26 @@ function compile(to::Symbol, ctx::CompilerContext; ## LLVM IR - ir, entry = irgen(ctx, linfo, world) + ir, entry = irgen(job, linfo, world) need_library(lib) = any(f -> isdeclaration(f) && intrinsic_id(f) == 0 && haskey(functions(lib), LLVM.name(f)), functions(ir)) - libdevice = load_libdevice(ctx.cap) + libdevice = load_libdevice(job.cap) if need_library(libdevice) - link_libdevice!(ctx, ir, libdevice) + link_libdevice!(job, ir, libdevice) end # optimize the IR if optimize - entry = optimize!(ctx, ir, entry) + entry = optimize!(job, ir, entry) end - runtime = load_runtime(ctx.cap) + runtime = load_runtime(job.cap) if need_library(runtime) - link_library!(ctx, ir, runtime) + link_library!(job, ir, runtime) end verify(ir) @@ -119,7 +119,7 @@ function compile(to::Symbol, ctx::CompilerContext; # compile and link for (dyn_f, dyn_tt) in keys(worklist) - dyn_ctx = CompilerContext(dyn_f, dyn_tt, ctx.cap, true) + dyn_ctx = CompilerJob(dyn_f, dyn_tt, job.cap, true) dyn_ir, dyn_entry = compile(:llvm, dyn_ctx; hooks=false, optimize=optimize, strip=strip) @@ -135,7 +135,7 @@ function compile(to::Symbol, ctx::CompilerContext; end end - @compiler_assert isempty(uses(f)) ctx + @compiler_assert isempty(uses(f)) job unsafe_delete!(ir, f) end @@ -144,12 +144,12 @@ function compile(to::Symbol, ctx::CompilerContext; ## PTX machine code - prepare_execution!(ctx, ir) + prepare_execution!(job, ir) - check_invocation(ctx, entry) - check_ir(ctx, ir) + check_invocation(job, entry) + check_ir(job, ir) - asm = mcgen(ctx, ir, entry) + asm = mcgen(job, ir, entry) to == :ptx && return asm, LLVM.name(entry) diff --git a/src/compiler/irgen.jl b/src/compiler/irgen.jl index e9677092..23361722 100644 --- a/src/compiler/irgen.jl +++ b/src/compiler/irgen.jl @@ -16,7 
+16,7 @@ safe_fn(f::Core.Function) = safe_fn(String(typeof(f).name.mt.name)) safe_fn(f::LLVM.Function) = safe_fn(LLVM.name(f)) # generate a pseudo-backtrace from a stack of methods being emitted -function backtrace(ctx::CompilerContext, method_stack::Vector{Core.MethodInstance}) +function backtrace(job::CompilerJob, method_stack::Vector{Core.MethodInstance}) bt = StackTraces.StackFrame[] for method_instance in method_stack method = method_instance.def @@ -34,7 +34,7 @@ end Base.showerror(io::IO, err::MethodSubstitutionWarning) = print(io, "You called $(err.original), maybe you intended to call $(err.substitute) instead?") -function compile_linfo(ctx::CompilerContext, linfo::Core.MethodInstance, world) +function compile_linfo(job::CompilerJob, linfo::Core.MethodInstance, world) # set-up the compiler interface function hook_module_setup(ref::Ptr{Cvoid}) ref = convert(LLVM.API.LLVMModuleRef, ref) @@ -51,8 +51,8 @@ function compile_linfo(ctx::CompilerContext, linfo::Core.MethodInstance, world) # check for recursion if method_instance in method_stack[1:end-1] - throw(KernelError(ctx, "recursion is currently not supported"; - bt=backtrace(ctx, method_stack))) + throw(KernelError(job, "recursion is currently not supported"; + bt=backtrace(job, method_stack))) end # check for Base methods that exist in CUDAnative too @@ -65,13 +65,13 @@ function compile_linfo(ctx::CompilerContext, linfo::Core.MethodInstance, world) if hasmethod(substitute_function, tt) method′ = which(substitute_function, tt) if Base.moduleroot(method′.module) == CUDAnative - @warn "calls to Base intrinsics might be GPU incompatible" exception=(MethodSubstitutionWarning(method, method′), backtrace(ctx, method_stack)) + @warn "calls to Base intrinsics might be GPU incompatible" exception=(MethodSubstitutionWarning(method, method′), backtrace(job, method_stack)) end end end end function hook_emitted_function(method, code, world) - @compiler_assert last(method_stack) == method ctx + @compiler_assert last(method_stack) == method job pop!(method_stack) end params = Base.CodegenParams(cached = false, @@ -89,7 +89,7 @@ function compile_linfo(ctx::CompilerContext, linfo::Core.MethodInstance, world) (Any, UInt, Bool, Bool, Base.CodegenParams), linfo, world, #=wrapper=#false, #=optimize=#false, params) if ref == C_NULL - throw(InternalCompilerError(ctx, "the Julia compiler could not generate LLVM IR")) + throw(InternalCompilerError(job, "the Julia compiler could not generate LLVM IR")) end llvmf = LLVM.Function(ref) @@ -97,8 +97,8 @@ function compile_linfo(ctx::CompilerContext, linfo::Core.MethodInstance, world) return llvmf, modules end -function irgen(ctx::CompilerContext, linfo::Core.MethodInstance, world) - entry, modules = compile_linfo(ctx, linfo, world) +function irgen(job::CompilerJob, linfo::Core.MethodInstance, world) + entry, modules = compile_linfo(job, linfo, world) # link in dependent modules mod = popfirst!(modules) @@ -165,8 +165,8 @@ function irgen(ctx::CompilerContext, linfo::Core.MethodInstance, world) # minimal required optimization ModulePassManager() do pm - global global_ctx - global_ctx = ctx + global current_job + current_job = job add!(pm, ModulePass("LowerThrow", lower_throw!)) add!(pm, FunctionPass("HideUnreachable", hide_unreachable!)) @@ -187,7 +187,7 @@ end # once we have thorough inference (ie. discarding `@nospecialize` and thus supporting # exception arguments) and proper debug info to unwind the stack, this pass can go. 
function lower_throw!(mod::LLVM.Module) - ctx = global_ctx::CompilerContext + job = current_job::CompilerJob changed = false throw_functions = Dict{String,String}( @@ -241,7 +241,7 @@ function lower_throw!(mod::LLVM.Module) changed = true end - @compiler_assert isempty(uses(f)) ctx + @compiler_assert isempty(uses(f)) job end end @@ -300,7 +300,7 @@ end # only to prevent introducing non-structureness during optimization (ie. the front-end # is still responsible for generating structured control flow). function hide_unreachable!(fun::LLVM.Function) - ctx = global_ctx::CompilerContext + job = current_job::CompilerJob changed = false # remove `noreturn` attributes @@ -413,7 +413,7 @@ end # # if LLVM knows we're trapping, code is marked `unreachable` (see `hide_unreachable!`). function hide_trap!(mod::LLVM.Module) - ctx = global_ctx::CompilerContext + job = current_job::CompilerJob changed = false # inline assembly to exit a thread, hiding control flow from LLVM diff --git a/src/compiler/mcgen.jl b/src/compiler/mcgen.jl index 3d46db44..ef309fe1 100644 --- a/src/compiler/mcgen.jl +++ b/src/compiler/mcgen.jl @@ -21,10 +21,10 @@ end # final preparations for the module to be compiled to PTX # these passes should not be run when e.g. compiling to write to disk. -function prepare_execution!(ctx::CompilerContext, mod::LLVM.Module) +function prepare_execution!(job::CompilerJob, mod::LLVM.Module) let pm = ModulePassManager() - global global_ctx - global_ctx = ctx + global current_job + current_job = job global_optimizer!(pm) @@ -49,7 +49,7 @@ end # # this pass performs that resolution at link time. function resolve_cpu_references!(mod::LLVM.Module) - ctx = global_ctx::CompilerContext + job = current_job::CompilerJob changed = false for f in functions(mod) @@ -85,8 +85,8 @@ function resolve_cpu_references!(mod::LLVM.Module) return changed end -function mcgen(ctx::CompilerContext, mod::LLVM.Module, f::LLVM.Function) - tm = machine(ctx.cap, triple(mod)) +function mcgen(job::CompilerJob, mod::LLVM.Module, f::LLVM.Function) + tm = machine(job.cap, triple(mod)) InitializeNVPTXAsmPrinter() return String(emit(tm, mod, LLVM.API.LLVMAssemblyFile)) diff --git a/src/compiler/optim.jl b/src/compiler/optim.jl index 7e38e155..f4f3302b 100644 --- a/src/compiler/optim.jl +++ b/src/compiler/optim.jl @@ -1,10 +1,10 @@ # LLVM IR optimization -function optimize!(ctx::CompilerContext, mod::LLVM.Module, entry::LLVM.Function) - tm = machine(ctx.cap, triple(mod)) +function optimize!(job::CompilerJob, mod::LLVM.Module, entry::LLVM.Function) + tm = machine(job.cap, triple(mod)) - if ctx.kernel - entry = promote_kernel!(ctx, mod, entry) + if job.kernel + entry = promote_kernel!(job, mod, entry) end function initialize!(pm) @@ -13,8 +13,8 @@ function optimize!(ctx::CompilerContext, mod::LLVM.Module, entry::LLVM.Function) internalize!(pm, [LLVM.name(entry)]) end - global global_ctx - global_ctx = ctx + global current_job + current_job = job # Julia-specific optimizations # @@ -118,8 +118,8 @@ end # promote a function to a kernel # FIXME: sig vs tt (code_llvm vs cufunction) -function promote_kernel!(ctx::CompilerContext, mod::LLVM.Module, entry_f::LLVM.Function) - kernel = wrap_entry!(ctx, mod, entry_f) +function promote_kernel!(job::CompilerJob, mod::LLVM.Module, entry_f::LLVM.Function) + kernel = wrap_entry!(job, mod, entry_f) # property annotations # TODO: belongs in irgen? doesn't maxntidx doesn't appear in ptx code? 
@@ -130,16 +130,16 @@ function promote_kernel!(ctx::CompilerContext, mod::LLVM.Module, entry_f::LLVM.F append!(annotations, [MDString("kernel"), ConstantInt(Int32(1), JuliaContext())]) ## expected CTA sizes - if ctx.minthreads != nothing - bounds = CUDAdrv.CuDim3(ctx.minthreads) + if job.minthreads != nothing + bounds = CUDAdrv.CuDim3(job.minthreads) for dim in (:x, :y, :z) bound = getfield(bounds, dim) append!(annotations, [MDString("reqntid$dim"), ConstantInt(Int32(bound), JuliaContext())]) end end - if ctx.maxthreads != nothing - bounds = CUDAdrv.CuDim3(ctx.maxthreads) + if job.maxthreads != nothing + bounds = CUDAdrv.CuDim3(job.maxthreads) for dim in (:x, :y, :z) bound = getfield(bounds, dim) append!(annotations, [MDString("maxntid$dim"), @@ -147,14 +147,14 @@ function promote_kernel!(ctx::CompilerContext, mod::LLVM.Module, entry_f::LLVM.F end end - if ctx.blocks_per_sm != nothing + if job.blocks_per_sm != nothing append!(annotations, [MDString("minctasm"), - ConstantInt(Int32(ctx.blocks_per_sm), JuliaContext())]) + ConstantInt(Int32(job.blocks_per_sm), JuliaContext())]) end - if ctx.maxregs != nothing + if job.maxregs != nothing append!(annotations, [MDString("maxnreg"), - ConstantInt(Int32(ctx.maxregs), JuliaContext())]) + ConstantInt(Int32(job.maxregs), JuliaContext())]) end @@ -178,12 +178,12 @@ function wrapper_type(julia_t::Type, codegen_t::LLVMType)::LLVMType end # generate a kernel wrapper to fix & improve argument passing -function wrap_entry!(ctx::CompilerContext, mod::LLVM.Module, entry_f::LLVM.Function) +function wrap_entry!(job::CompilerJob, mod::LLVM.Module, entry_f::LLVM.Function) entry_ft = eltype(llvmtype(entry_f)::LLVM.PointerType)::LLVM.FunctionType - @compiler_assert return_type(entry_ft) == LLVM.VoidType(JuliaContext()) ctx + @compiler_assert return_type(entry_ft) == LLVM.VoidType(JuliaContext()) job # filter out ghost types, which don't occur in the LLVM function signatures - sig = Base.signature_type(ctx.f, ctx.tt)::Type + sig = Base.signature_type(job.f, job.tt)::Type julia_types = Type[] for dt::Type in sig.parameters if !isghosttype(dt) @@ -216,8 +216,8 @@ function wrap_entry!(ctx::CompilerContext, mod::LLVM.Module, entry_f::LLVM.Funct if codegen_t != wrapper_t # the wrapper argument doesn't match the kernel parameter type. # this only happens when codegen wants to pass a pointer. - @compiler_assert isa(codegen_t, LLVM.PointerType) ctx - @compiler_assert eltype(codegen_t) == wrapper_t ctx + @compiler_assert isa(codegen_t, LLVM.PointerType) job + @compiler_assert eltype(codegen_t) == wrapper_t job # copy the argument value to a stack slot, and reference it. ptr = alloca!(builder, wrapper_t) @@ -299,7 +299,7 @@ end # such IR is hard to clean-up, so we probably will need to have the GC lowering pass emit # lower-level intrinsics which then can be lowered to architecture-specific code. 
function lower_gc_frame!(fun::LLVM.Function) - ctx = global_ctx::CompilerContext + job = current_job::CompilerJob mod = LLVM.parent(fun) changed = false @@ -330,7 +330,7 @@ function lower_gc_frame!(fun::LLVM.Function) changed = true end - @compiler_assert isempty(uses(alloc_obj)) ctx + @compiler_assert isempty(uses(alloc_obj)) job end # we don't care about write barriers @@ -343,7 +343,7 @@ function lower_gc_frame!(fun::LLVM.Function) changed = true end - @compiler_assert isempty(uses(barrier)) ctx + @compiler_assert isempty(uses(barrier)) job end return changed @@ -357,7 +357,7 @@ end # TODO: maybe don't have Julia emit actual uses of the TLS, but use intrinsics instead, # making it easier to remove or reimplement that functionality here. function lower_ptls!(mod::LLVM.Module) - ctx = global_ctx::CompilerContext + job = current_job::CompilerJob changed = false if haskey(functions(mod), "julia.ptls_states") @@ -372,7 +372,7 @@ function lower_ptls!(mod::LLVM.Module) changed = true end - @compiler_assert isempty(uses(ptls_getter)) ctx + @compiler_assert isempty(uses(ptls_getter)) job end return changed diff --git a/src/compiler/rtlib.jl b/src/compiler/rtlib.jl index 58b6c9a5..d99dcccd 100644 --- a/src/compiler/rtlib.jl +++ b/src/compiler/rtlib.jl @@ -1,6 +1,6 @@ # compiler support for working with run-time libraries -function link_library!(ctx::CompilerContext, mod::LLVM.Module, lib::LLVM.Module) +function link_library!(job::CompilerJob, mod::LLVM.Module, lib::LLVM.Module) # linking is destructive, so copy the library lib = LLVM.Module(lib) @@ -61,12 +61,12 @@ function load_libdevice(cap) end end -function link_libdevice!(ctx::CompilerContext, mod::LLVM.Module, lib::LLVM.Module) +function link_libdevice!(job::CompilerJob, mod::LLVM.Module, lib::LLVM.Module) # override libdevice's triple and datalayout to avoid warnings triple!(lib, triple(mod)) datalayout!(lib, datalayout(mod)) - link_library!(ctx, mod, lib) + link_library!(job, mod, lib) ModulePassManager() do pm push!(metadata(mod), "nvvm-reflect-ftz", diff --git a/src/compiler/validation.jl b/src/compiler/validation.jl index 1c225f44..aa9fb682 100644 --- a/src/compiler/validation.jl +++ b/src/compiler/validation.jl @@ -1,19 +1,19 @@ # validation of properties and code -function check_method(ctx::CompilerContext) - isa(ctx.f, Core.Builtin) && throw(KernelError(ctx, "function is not a generic function")) +function check_method(job::CompilerJob) + isa(job.f, Core.Builtin) && throw(KernelError(job, "function is not a generic function")) # get the method - ms = Base.methods(ctx.f, ctx.tt) - isempty(ms) && throw(KernelError(ctx, "no method found")) - length(ms)!=1 && throw(KernelError(ctx, "no unique matching method")) + ms = Base.methods(job.f, job.tt) + isempty(ms) && throw(KernelError(job, "no method found")) + length(ms)!=1 && throw(KernelError(job, "no unique matching method")) m = first(ms) # kernels can't return values - if ctx.kernel - rt = Base.return_types(ctx.f, ctx.tt)[1] + if job.kernel + rt = Base.return_types(job.f, job.tt)[1] if rt != Nothing - throw(KernelError(ctx, "kernel returns a value of type `$rt`", + throw(KernelError(job, "kernel returns a value of type `$rt`", """Make sure your kernel function ends in `return`, `return nothing` or `nothing`. If the returned value is of type `Union{}`, your Julia code probably throws an exception. 
Inspect the code with `@device_code_warntype` for more details.""")) @@ -23,10 +23,10 @@ function check_method(ctx::CompilerContext) return end -function check_invocation(ctx::CompilerContext, entry::LLVM.Function) +function check_invocation(job::CompilerJob, entry::LLVM.Function) # make sure any non-isbits arguments are unused real_arg_i = 0 - sig = Base.signature_type(ctx.f, ctx.tt)::Type + sig = Base.signature_type(job.f, job.tt)::Type for (arg_i,dt) in enumerate(sig.parameters) isghosttype(dt) && continue real_arg_i += 1 @@ -34,7 +34,7 @@ function check_invocation(ctx::CompilerContext, entry::LLVM.Function) if !isbitstype(dt) param = parameters(entry)[real_arg_i] if !isempty(uses(param)) - throw(KernelError(ctx, "passing and using non-bitstype argument", + throw(KernelError(job, "passing and using non-bitstype argument", """Argument $arg_i to your kernel function is of type $dt. That type is not isbits, and such arguments are only allowed when they are unused by the kernel.""")) end @@ -50,7 +50,7 @@ end const IRError = Tuple{String, StackTraces.StackTrace, Any} # kind, bt, meta struct InvalidIRError <: Exception - ctx::CompilerContext + job::CompilerJob errors::Vector{IRError} end @@ -59,7 +59,7 @@ const UNKNOWN_FUNCTION = "call to an unknown function" const POINTER_FUNCTION = "call through a literal pointer" function Base.showerror(io::IO, err::InvalidIRError) - print(io, "InvalidIRError: compiling $(signature(err.ctx)) resulted in invalid LLVM IR") + print(io, "InvalidIRError: compiling $(signature(err.job)) resulted in invalid LLVM IR") for (kind, bt, meta) in err.errors print(io, "\nReason: unsupported $kind") if meta != nothing @@ -72,28 +72,28 @@ function Base.showerror(io::IO, err::InvalidIRError) return end -function check_ir(ctx, args...) - errors = check_ir!(ctx, IRError[], args...) +function check_ir(job, args...) + errors = check_ir!(job, IRError[], args...) 
unique!(errors) if !isempty(errors) - throw(InvalidIRError(ctx, errors)) + throw(InvalidIRError(job, errors)) end return end -function check_ir!(ctx, errors::Vector{IRError}, mod::LLVM.Module) +function check_ir!(job, errors::Vector{IRError}, mod::LLVM.Module) for f in functions(mod) - check_ir!(ctx, errors, f) + check_ir!(job, errors, f) end return errors end -function check_ir!(ctx, errors::Vector{IRError}, f::LLVM.Function) +function check_ir!(job, errors::Vector{IRError}, f::LLVM.Function) for bb in blocks(f), inst in instructions(bb) if isa(inst, LLVM.CallInst) - check_ir!(ctx, errors, inst) + check_ir!(job, errors, inst) end end @@ -111,7 +111,7 @@ const special_fns = ( const libjulia = Ref{Ptr{Cvoid}}(C_NULL) -function check_ir!(ctx, errors::Vector{IRError}, inst::LLVM.CallInst) +function check_ir!(job, errors::Vector{IRError}, inst::LLVM.CallInst) dest = called_value(inst) if isa(dest, LLVM.Function) fn = LLVM.name(dest) @@ -143,7 +143,7 @@ function check_ir!(ctx, errors::Vector{IRError}, inst::LLVM.CallInst) if occursin("inttoptr", string(dest)) # extract the literal pointer ptr_arg = first(operands(dest)) - @compiler_assert isa(ptr_arg, ConstantInt) ctx + @compiler_assert isa(ptr_arg, ConstantInt) job ptr_val = convert(Int, ptr_arg) ptr = Ptr{Cvoid}(ptr_val) @@ -151,7 +151,7 @@ function check_ir!(ctx, errors::Vector{IRError}, inst::LLVM.CallInst) bt = backtrace(inst) frames = ccall(:jl_lookup_code_address, Any, (Ptr{Cvoid}, Cint,), ptr, 0) if length(frames) >= 1 - @compiler_assert length(frames) == 1 ctx frames=frames + @compiler_assert length(frames) == 1 job frames=frames fn, file, line, linfo, fromC, inlined, ip = last(frames) push!(errors, (POINTER_FUNCTION, bt, fn)) else diff --git a/src/reflection.jl b/src/reflection.jl index 1b23388f..d92ae900 100644 --- a/src/reflection.jl +++ b/src/reflection.jl @@ -31,13 +31,13 @@ function code_llvm(io::IO, @nospecialize(func::Core.Function), @nospecialize(typ dump_module::Bool=false, strip_ir_metadata::Bool=true, kernel::Bool=false, kwargs...) tt = Base.to_tuple_type(types) - ctx = CompilerContext(func, tt, cap, kernel; kwargs...) - code_llvm(io, ctx; optimize=optimize, dump_module=dump_module, + job = CompilerJob(func, tt, cap, kernel; kwargs...) + code_llvm(io, job; optimize=optimize, dump_module=dump_module, strip_ir_metadata=strip_ir_metadata) end -function code_llvm(io::IO, ctx::CompilerContext; optimize::Bool=true, +function code_llvm(io::IO, job::CompilerJob; optimize::Bool=true, dump_module::Bool=false, strip_ir_metadata::Bool=true) - ir, entry = compile(:llvm, ctx; hooks=false, optimize=optimize, strip=strip_ir_metadata) + ir, entry = compile(:llvm, job; hooks=false, optimize=optimize, strip=strip_ir_metadata) if dump_module show(io, ir) else @@ -63,11 +63,11 @@ function code_ptx(io::IO, @nospecialize(func::Core.Function), @nospecialize(type cap::VersionNumber=current_capability(), kernel::Bool=false, strip_ir_metadata::Bool=true, kwargs...) tt = Base.to_tuple_type(types) - ctx = CompilerContext(func, tt, cap, kernel; kwargs...) - code_ptx(io, ctx; strip_ir_metadata=strip_ir_metadata) + job = CompilerJob(func, tt, cap, kernel; kwargs...) 
+ code_ptx(io, job; strip_ir_metadata=strip_ir_metadata) end -function code_ptx(io::IO, ctx::CompilerContext; strip_ir_metadata::Bool=true) - asm, _ = compile(:ptx, ctx; hooks=false, strip=strip_ir_metadata) +function code_ptx(io::IO, job::CompilerJob; strip_ir_metadata::Bool=true) + asm, _ = compile(:ptx, job; hooks=false, strip=strip_ir_metadata) print(io, asm) end code_ptx(@nospecialize(func), @nospecialize(types); kwargs...) = @@ -87,21 +87,21 @@ See also: [`@device_code_sass`](@ref) function code_sass(io::IO, @nospecialize(func::Core.Function), @nospecialize(types); cap::VersionNumber=current_capability(), kernel::Bool=true, kwargs...) tt = Base.to_tuple_type(types) - ctx = CompilerContext(func, tt, cap, kernel; kwargs...) - code_sass(io, ctx) + job = CompilerJob(func, tt, cap, kernel; kwargs...) + code_sass(io, job) end -function code_sass(io::IO, ctx::CompilerContext) - if !ctx.kernel +function code_sass(io::IO, job::CompilerJob) + if !job.kernel error("Can only generate SASS code for kernel functions") end if ptxas === nothing || nvdisasm === nothing error("Your CUDA installation does not provide ptxas or nvdisasm, both of which are required for code_sass") end - ptx, _ = compile(:ptx, ctx; hooks=false) + ptx, _ = compile(:ptx, job; hooks=false) fn = tempname() - gpu = "sm_$(ctx.cap.major)$(ctx.cap.minor)" + gpu = "sm_$(job.cap.major)$(job.cap.minor)" # NOTE: this might not match what is being executed, due to the PTX->SASS conversion # by the driver possibly not matching what `ptxas` (part of the toolkit) does. # TODO: see how `nvvp` extracts SASS code when doing PC sampling, and copy that. @@ -142,9 +142,9 @@ function emit_hooked_compilation(inner_hook, ex...) empty!(CUDAnative.compilecache) local kernels = 0 - function outer_hook(ctx) + function outer_hook(job) kernels += 1 - $inner_hook(ctx; $(map(esc, user_kwargs)...)) + $inner_hook(job; $(map(esc, user_kwargs)...)) end if CUDAnative.compile_hook[] != nothing @@ -184,8 +184,8 @@ See also: [`InteractiveUtils.@code_lowered`](@ref) macro device_code_lowered(ex...) quote buf = Any[] - function hook(ctx::CompilerContext) - append!(buf, code_lowered(ctx.f, ctx.tt)) + function hook(job::CompilerJob) + append!(buf, code_lowered(job.f, job.tt)) end $(emit_hooked_compilation(:hook, ex...)) buf @@ -203,8 +203,8 @@ See also: [`InteractiveUtils.@code_typed`](@ref) macro device_code_typed(ex...) quote buf = Any[] - function hook(ctx::CompilerContext) - append!(buf, code_typed(ctx.f, ctx.tt)) + function hook(job::CompilerJob) + append!(buf, code_typed(job.f, job.tt)) end $(emit_hooked_compilation(:hook, ex...)) buf @@ -220,8 +220,8 @@ Evaluates the expression `ex` and prints the result of See also: [`InteractiveUtils.@code_warntype`](@ref) """ macro device_code_warntype(ex...) - function hook(ctx::CompilerContext; io::IO=stdout, kwargs...) - code_warntype(io, ctx.f, ctx.tt; kwargs...) + function hook(job::CompilerJob; io::IO=stdout, kwargs...) + code_warntype(io, job.f, job.tt; kwargs...) end emit_hooked_compilation(hook, ex...) end @@ -236,7 +236,7 @@ to `io` for every compiled CUDA kernel. For other supported keywords, see See also: [`InteractiveUtils.@code_llvm`](@ref) """ macro device_code_llvm(ex...) - hook(ctx::CompilerContext; io::IO=stdout, kwargs...) = code_llvm(io, ctx; kwargs...) + hook(job::CompilerJob; io::IO=stdout, kwargs...) = code_llvm(io, job; kwargs...) emit_hooked_compilation(hook, ex...) end @@ -248,7 +248,7 @@ for every compiled CUDA kernel. For other supported keywords, see [`CUDAnative.code_ptx`](@ref). 
""" macro device_code_ptx(ex...) - hook(ctx::CompilerContext; io::IO=stdout, kwargs...) = code_ptx(io, ctx; kwargs...) + hook(job::CompilerJob; io::IO=stdout, kwargs...) = code_ptx(io, job; kwargs...) emit_hooked_compilation(hook, ex...) end @@ -260,7 +260,7 @@ Evaluates the expression `ex` and prints the result of [`CUDAnative.code_sass`]( [`CUDAnative.code_sass`](@ref). """ macro device_code_sass(ex...) - hook(ctx::CompilerContext; io::IO=stdout, kwargs...) = code_sass(io, ctx; kwargs...) + hook(job::CompilerJob; io::IO=stdout, kwargs...) = code_sass(io, job; kwargs...) emit_hooked_compilation(hook, ex...) end @@ -272,34 +272,34 @@ Evaluates the expression `ex` and dumps all intermediate forms of code to the di """ macro device_code(ex...) only(xs) = (@assert length(xs) == 1; first(xs)) - function hook(ctx::CompilerContext; dir::AbstractString) - fn = "$(typeof(ctx.f).name.mt.name)_$(globalUnique+1)" + function hook(job::CompilerJob; dir::AbstractString) + fn = "$(typeof(job.f).name.mt.name)_$(globalUnique+1)" mkpath(dir) open(joinpath(dir, "$fn.lowered.jl"), "w") do io - code = only(code_lowered(ctx.f, ctx.tt)) + code = only(code_lowered(job.f, job.tt)) println(io, code) end open(joinpath(dir, "$fn.typed.jl"), "w") do io - code = only(code_typed(ctx.f, ctx.tt)) + code = only(code_typed(job.f, job.tt)) println(io, code) end open(joinpath(dir, "$fn.unopt.ll"), "w") do io - code_llvm(io, ctx; dump_module=true, strip_ir_metadata=false, optimize=false) + code_llvm(io, job; dump_module=true, strip_ir_metadata=false, optimize=false) end open(joinpath(dir, "$fn.opt.ll"), "w") do io - code_llvm(io, ctx; dump_module=true, strip_ir_metadata=false) + code_llvm(io, job; dump_module=true, strip_ir_metadata=false) end open(joinpath(dir, "$fn.ptx"), "w") do io - code_ptx(io, ctx) + code_ptx(io, job) end open(joinpath(dir, "$fn.sass"), "w") do io - code_sass(io, ctx) + code_sass(io, job) end end emit_hooked_compilation(hook, ex...) From 9dabf73bd78ad4e022560422c4e9a253e42346da Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Thu, 21 Mar 2019 13:50:04 +0100 Subject: [PATCH 23/34] Bugfix. --- src/execution.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/execution.jl b/src/execution.jl index 428f586a..afe3ca18 100644 --- a/src/execution.jl +++ b/src/execution.jl @@ -425,7 +425,7 @@ end append!(ex.args, (quote #GC.@preserve $(converted_args...) begin - launch(f, blocks, threads, shmem, stream, ($(arg_ptrs...),)) + launch(f, blocks, threads, shmem, stream, $(arg_ptrs...)) #end end).args) From e2fefdaadee486bad676751491eee883db6c07c9 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Thu, 21 Mar 2019 14:31:04 +0100 Subject: [PATCH 24/34] Iterative compilation to deal with nested recursion. 
--- Project.toml | 1 + src/CUDAnative.jl | 1 + src/compiler/driver.jl | 116 ++++++++++++++++++++++++----------------- 3 files changed, 70 insertions(+), 48 deletions(-) diff --git a/Project.toml b/Project.toml index a62a301c..fd44b603 100644 --- a/Project.toml +++ b/Project.toml @@ -5,6 +5,7 @@ uuid = "be33ccc6-a3ff-5ff2-a52e-74243cff1e17" Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" CUDAapi = "3895d2a7-ec45-59b8-82bb-cfc6a382f9b3" CUDAdrv = "c5f51814-7f29-56b8-a69c-e4d8f6be1fde" +DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240" LLVM = "929cbde3-209d-540e-8aea-75f648917ca0" Libdl = "8f399da3-3557-5675-b5ff-fb832c97cbdb" diff --git a/src/CUDAnative.jl b/src/CUDAnative.jl index 018729ae..fab16f49 100644 --- a/src/CUDAnative.jl +++ b/src/CUDAnative.jl @@ -6,6 +6,7 @@ using LLVM using LLVM.Interop using Adapt +using DataStructures using Pkg using Libdl diff --git a/src/compiler/driver.jl b/src/compiler/driver.jl index 9687b050..d6e6544f 100644 --- a/src/compiler/driver.jl +++ b/src/compiler/driver.jl @@ -62,7 +62,7 @@ function compile(to::Symbol, job::CompilerJob; ## LLVM IR - ir, entry = irgen(job, linfo, world) + ir, kernel = irgen(job, linfo, world) need_library(lib) = any(f -> isdeclaration(f) && intrinsic_id(f) == 0 && @@ -76,7 +76,7 @@ function compile(to::Symbol, job::CompilerJob; # optimize the IR if optimize - entry = optimize!(job, ir, entry) + kernel = optimize!(job, ir, kernel) end runtime = load_runtime(job.cap) @@ -90,68 +90,88 @@ function compile(to::Symbol, job::CompilerJob; strip_debuginfo!(ir) end + kernel_fn = LLVM.name(kernel) + kernel_ft = eltype(llvmtype(kernel)) + + to == :llvm && return ir, kernel + ## dynamic parallelism + kernels = OrderedDict{CompilerJob, String}(job => kernel_fn) + if haskey(functions(ir), "cudanativeCompileKernel") - f = functions(ir)["cudanativeCompileKernel"] - - # find dynamic kernel invocations - # TODO: recover this information earlier, from the Julia IR - worklist = Dict{Tuple{Core.Function,Type}, Vector{LLVM.CallInst}}() - for use in uses(f) - # decode the call - call = user(use)::LLVM.CallInst - ops = collect(operands(call))[1:2] - ## addrspacecast - ops = LLVM.Value[first(operands(val)) for val in ops] - ## inttoptr - ops = ConstantInt[first(operands(val)) for val in ops] - ## integer constants - ops = convert.(Int, ops) - ## actual pointer values - ops = Ptr{Any}.(ops) - - dyn_f, dyn_tt = unsafe_pointer_to_objref.(ops) - calls = get!(worklist, (dyn_f, dyn_tt), LLVM.CallInst[]) - push!(calls, call) - end + dyn_maker = functions(ir)["cudanativeCompileKernel"] + + # iterative compilation (non-recursive) + changed = true + while changed + changed = false + + # find dynamic kernel invocations + # TODO: recover this information earlier, from the Julia IR + worklist = MultiDict{CompilerJob, LLVM.CallInst}() + for use in uses(dyn_maker) + # decode the call + call = user(use)::LLVM.CallInst + ops = collect(operands(call))[1:2] + ## addrspacecast + ops = LLVM.Value[first(operands(val)) for val in ops] + ## inttoptr + ops = ConstantInt[first(operands(val)) for val in ops] + ## integer constants + ops = convert.(Int, ops) + ## actual pointer values + ops = Ptr{Any}.(ops) + + dyn_f, dyn_tt = unsafe_pointer_to_objref.(ops) + dyn_job = CompilerJob(dyn_f, dyn_tt, job.cap, #=kernel=# true) + push!(worklist, dyn_job => call) + end - # compile and link - for (dyn_f, dyn_tt) in keys(worklist) - dyn_ctx = CompilerJob(dyn_f, dyn_tt, job.cap, true) - dyn_ir, dyn_entry = - compile(:llvm, dyn_ctx; 
hooks=false, optimize=optimize, strip=strip) - - dyn_fn = LLVM.name(dyn_entry) - link!(ir, dyn_ir) - dyn_ir = nothing - dyn_entry = functions(ir)[dyn_fn] - - # insert a call everywhere the kernel is used - for call in worklist[(dyn_f,dyn_tt)] - replace_uses!(call, dyn_entry) - unsafe_delete!(LLVM.parent(call), call) + # compile and link + for dyn_job in keys(worklist) + # cached compilation + dyn_kernel_fn = get!(kernels, dyn_job) do + dyn_ir, dyn_kernel = compile(:llvm, dyn_job; hooks=false, + optimize=optimize, strip=strip) + dyn_kernel_fn = LLVM.name(dyn_kernel) + dyn_kernel_ft = eltype(llvmtype(dyn_kernel)) + link!(ir, dyn_ir) + changed = true + dyn_kernel_fn + end + dyn_kernel = functions(ir)[dyn_kernel_fn] + + # insert a pointer to the function everywhere the kernel is used + T_ptr = convert(LLVMType, Ptr{Cvoid}) + for call in worklist[dyn_job] + Builder(JuliaContext()) do builder + position!(builder, call) + fptr = ptrtoint!(builder, dyn_kernel, T_ptr) + replace_uses!(call, fptr) + end + unsafe_delete!(LLVM.parent(call), call) + end end end - @compiler_assert isempty(uses(f)) job - unsafe_delete!(ir, f) + # all dynamic launches should have been resolved + @compiler_assert isempty(uses(dyn_maker)) job + unsafe_delete!(ir, dyn_maker) end - to == :llvm && return ir, entry - ## PTX machine code prepare_execution!(job, ir) - check_invocation(job, entry) + check_invocation(job, kernel) check_ir(job, ir) - asm = mcgen(job, ir, entry) + asm = mcgen(job, ir, kernel) - to == :ptx && return asm, LLVM.name(entry) + to == :ptx && return asm, kernel_fn ## CUDA objects @@ -167,11 +187,11 @@ function compile(to::Symbol, job::CompilerJob; # link the CUDA device library linker = CUDAdrv.CuLink(jit_options) CUDAdrv.add_file!(linker, libcudadevrt, CUDAdrv.LIBRARY) - CUDAdrv.add_data!(linker, LLVM.name(entry), asm) + CUDAdrv.add_data!(linker, kernel_fn, asm) image = CUDAdrv.complete(linker) cuda_mod = CuModule(image, jit_options) - cuda_fun = CuFunction(cuda_mod, LLVM.name(entry)) + cuda_fun = CuFunction(cuda_mod, kernel_fn) to == :cuda && return cuda_fun, cuda_mod From ee21fbc90693f9010d2cd32fe15bb26ef70644c3 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Thu, 21 Mar 2019 15:39:19 +0100 Subject: [PATCH 25/34] Call into Julia's GC pass for IR cleanup purposes. Without it, LLVM+asserts will complain. --- src/compiler/optim.jl | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/compiler/optim.jl b/src/compiler/optim.jl index f4f3302b..eae9ae3f 100644 --- a/src/compiler/optim.jl +++ b/src/compiler/optim.jl @@ -51,9 +51,18 @@ function optimize!(job::CompilerJob, mod::LLVM.Module, entry::LLVM.Function) ModulePassManager() do pm initialize!(pm) + + # lower intrinsics add!(pm, FunctionPass("LowerGCFrame", lower_gc_frame!)) aggressive_dce!(pm) # remove dead uses of ptls add!(pm, ModulePass("LowerPTLS", lower_ptls!)) + + # the Julia GC lowering pass also has some clean-up that is required + function LLVMAddLateLowerGCFramePass(PM::LLVM.API.LLVMPassManagerRef) + LLVM.@apicall(:LLVMExtraAddLateLowerGCFramePass,Cvoid,(LLVM.API.LLVMPassManagerRef,), PM) + end + LLVMAddLateLowerGCFramePass(LLVM.ref(pm)) + run!(pm, mod) end end From 82e6afc347ff7cddc2b76bdd8afe0b8d968d1f35 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Thu, 21 Mar 2019 15:30:06 +0100 Subject: [PATCH 26/34] Document 265 issue. 
--- src/execution.jl | 1 + 1 file changed, 1 insertion(+) diff --git a/src/execution.jl b/src/execution.jl index afe3ca18..bfbcb932 100644 --- a/src/execution.jl +++ b/src/execution.jl @@ -453,6 +453,7 @@ No keyword arguments are supported. else # we can't compile here, so drop a marker which will get picked up during compilation quote + # TODO: add an edge to this method instance to support method redefinitions fptr = ccall("extern cudanativeCompileKernel", llvmcall, Ptr{Cvoid}, (Any, Any), f, tt) DeviceKernel{f,tt}(fptr) From 1aba961ba34d70ad0d0bdd30ae3158fac6668975 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Thu, 21 Mar 2019 15:35:30 +0100 Subject: [PATCH 27/34] Add tests. --- test/device/execution.jl | 92 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 92 insertions(+) diff --git a/test/device/execution.jl b/test/device/execution.jl index 3d37c040..86a7d8e5 100644 --- a/test/device/execution.jl +++ b/test/device/execution.jl @@ -807,4 +807,96 @@ end ############################################################################################ +@testset "dynamic parallelism" begin + +@testset "basic usage" begin + function hello() + @cuprintf("Hello, ") + @cuda dynamic=true world() + return + end + + @eval function world() + @cuprintf("World!") + return + end + + _, out = @grab_output begin + @cuda hello() + synchronize() + end + @test out == "Hello, World!" +end + +@testset "argument passing" begin + function kernel(a, b, c) + @cuprintf("%ld %ld %ld", Int64(a), Int64(b), Int64(c)) + return + end + + for args in ((Int16(1), Int32(2), Int64(3)), # padding + (Int32(1), Int32(2), Int32(3)), # no padding, equal size + (Int64(1), Int32(2), Int16(3)), # no padding, inequal size + (Int16(1), Int64(2), Int32(3))) # mixed + _, out = @grab_output begin + @cuda kernel(args...) + synchronize() + end + @test out == "1 2 3" + end +end + +@testset "self-recursion" begin + @eval function kernel(x::Bool) + if x + @cuprintf("recurse ") + @cuda dynamic=true kernel(false) + else + @cuprintf("stop") + end + return + end + + _, out = @grab_output begin + @cuda kernel(true) + synchronize() + end + @test out == "recurse stop" +end + +@testset "deep recursion" begin + @eval function kernel_a(x::Bool) + @cuprintf("a ") + @cuda dynamic=true kernel_b(x) + return + end + + @eval function kernel_b(x::Bool) + @cuprintf("b ") + @cuda dynamic=true kernel_c(x) + return + end + + @eval function kernel_c(x::Bool) + @cuprintf("c ") + if x + @cuprintf("recurse ") + @cuda dynamic=true kernel_a(false) + else + @cuprintf("stop") + end + return + end + + _, out = @grab_output begin + @cuda kernel_a(true) + synchronize() + end + @test out == "a b c recurse a b c stop" +end + +end + +############################################################################################ + end From b685fc8866aa37571c83936e807038399c279de8 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Thu, 21 Mar 2019 16:23:03 +0100 Subject: [PATCH 28/34] Fixes to runtime library generation. 
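The device runtime library is itself built by running the compiler over a set of Julia functions, so those inner invocations must not try to link libdevice or the very runtime that is being generated. The new `libraries` keyword makes that explicit: regular compilation links the auxiliary bitcode libraries, while runtime generation passes `libraries=false`. A rough sketch of the shape of the fix, using made-up names rather than the real driver:

    # Hypothetical sketch only: the inner calls disable library linking,
    # breaking the chicken-and-egg dependency on the runtime being built.
    function compile_sketch(entry::String; libraries::Bool=true)
        ir = ["ir($entry)"]
        if libraries
            for rtfn in ("gc_pool_alloc", "report_exception")       # assumed runtime entry points
                append!(ir, compile_sketch(rtfn; libraries=false))  # no runtime while building it
            end
        end
        return ir
    end

    compile_sketch("kernel")    # ["ir(kernel)", "ir(gc_pool_alloc)", "ir(report_exception)"]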
--- src/compiler/driver.jl | 26 ++++++++++++++++---------- src/compiler/rtlib.jl | 3 ++- 2 files changed, 18 insertions(+), 11 deletions(-) diff --git a/src/compiler/driver.jl b/src/compiler/driver.jl index d6e6544f..cff5e345 100644 --- a/src/compiler/driver.jl +++ b/src/compiler/driver.jl @@ -14,19 +14,22 @@ set, specialized code generation and optimization for kernel functions is enable The following keyword arguments are supported: - `hooks`: enable compiler hooks that drive reflection functions (default: true) +- `libraries`: link auxiliary bitcode libraries that may be required (default: true) - `optimize`: optimize the code (default: true) - `strip`: strip non-functional metadata and debug information (default: false) Other keyword arguments can be found in the documentation of [`cufunction`](@ref). """ compile(to::Symbol, cap::VersionNumber, @nospecialize(f::Core.Function), @nospecialize(tt), - kernel::Bool=true; hooks::Bool=true, optimize::Bool=true, strip::Bool=false, + kernel::Bool=true; hooks::Bool=true, libraries::Bool=true, + optimize::Bool=true, strip::Bool=false, kwargs...) = compile(to, CompilerJob(f, tt, cap, kernel; kwargs...); - hooks=hooks, optimize=optimize, strip=strip) + hooks=hooks, libraries=libraries, optimize=optimize, strip=strip) function compile(to::Symbol, job::CompilerJob; - hooks::Bool=true, optimize::Bool=true, strip::Bool=false) + hooks::Bool=true, libraries::Bool=true, + optimize::Bool=true, strip::Bool=false) @debug "(Re)compiling function" job if hooks && compile_hook[] != nothing @@ -69,19 +72,22 @@ function compile(to::Symbol, job::CompilerJob; haskey(functions(lib), LLVM.name(f)), functions(ir)) - libdevice = load_libdevice(job.cap) - if need_library(libdevice) - link_libdevice!(job, ir, libdevice) + if libraries + libdevice = load_libdevice(job.cap) + if need_library(libdevice) + link_libdevice!(job, ir, libdevice) + end end - # optimize the IR if optimize kernel = optimize!(job, ir, kernel) end - runtime = load_runtime(job.cap) - if need_library(runtime) - link_library!(job, ir, runtime) + if libraries + runtime = load_runtime(job.cap) + if need_library(runtime) + link_library!(job, ir, runtime) + end end verify(ir) diff --git a/src/compiler/rtlib.jl b/src/compiler/rtlib.jl index d99dcccd..5a9f4556 100644 --- a/src/compiler/rtlib.jl +++ b/src/compiler/rtlib.jl @@ -124,7 +124,8 @@ end function emit_function!(mod, cap, f, types, name) tt = Base.to_tuple_type(types) - new_mod, entry = compile(:llvm, cap, f, tt, #=kernel=# false; hooks=false) + new_mod, entry = compile(:llvm, cap, f, tt, #=kernel=# false; + hooks=false, libraries=false) LLVM.name!(entry, name) link!(mod, new_mod) end From 91ed41d942ad7f17670ecb52a880c363b0b4d076 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Fri, 22 Mar 2019 09:40:05 +0100 Subject: [PATCH 29/34] Simplify delayed compilation markers. 
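Rather than encoding raw object pointers for the target function and type tuple into the marker call (and decoding them via `inttoptr`/`unsafe_pointer_to_objref` in the driver), each `(f, tt)` pair is now pushed into a host-side table and only its integer index is embedded in the IR; the driver recovers the pair with a plain table lookup. A stripped-down, CUDA-free sketch of that registry pattern (names are illustrative, not the actual API):

    const _registry = Tuple{Function,Type}[]

    # The generator runs once per (f, tt) specialization, records the pair,
    # and emits only the integer id into the generated code.
    @generated function _register(::Val{f}, ::Val{tt}) where {f,tt}
        push!(_registry, (f, tt))
        id = length(_registry)
        return :($id)
    end

    _register(Val(sin), Val(Tuple{Float64}))   # returns 1; _registry[1] == (sin, Tuple{Float64})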
--- src/compiler/driver.jl | 15 ++++----------- src/execution.jl | 15 +++++++++++---- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/src/compiler/driver.jl b/src/compiler/driver.jl index cff5e345..3ca05945 100644 --- a/src/compiler/driver.jl +++ b/src/compiler/driver.jl @@ -120,17 +120,10 @@ function compile(to::Symbol, job::CompilerJob; for use in uses(dyn_maker) # decode the call call = user(use)::LLVM.CallInst - ops = collect(operands(call))[1:2] - ## addrspacecast - ops = LLVM.Value[first(operands(val)) for val in ops] - ## inttoptr - ops = ConstantInt[first(operands(val)) for val in ops] - ## integer constants - ops = convert.(Int, ops) - ## actual pointer values - ops = Ptr{Any}.(ops) - - dyn_f, dyn_tt = unsafe_pointer_to_objref.(ops) + id = convert(Int, first(operands(call))) + + global delayed_cufunctions + dyn_f, dyn_tt = delayed_cufunctions[id] dyn_job = CompilerJob(dyn_f, dyn_tt, job.cap, #=kernel=# true) push!(worklist, dyn_job => call) end diff --git a/src/execution.jl b/src/execution.jl index bfbcb932..7c6f861b 100644 --- a/src/execution.jl +++ b/src/execution.jl @@ -443,7 +443,11 @@ a callable kernel object. Device-side equivalent of [`CUDAnative.cufunction`](@r No keyword arguments are supported. """ -@generated function dynamic_cufunction(f::Core.Function, tt::Type=Tuple{}) +@inline dynamic_cufunction(f::Core.Function, tt::Type=Tuple{}) = + delayed_cufunction(Val(f), Val(tt)) + +const delayed_cufunctions = Vector{Tuple{Core.Function,Type}}() +@generated function delayed_cufunction(::Val{f}, ::Val{tt}) where {f,tt} if sizeof(f) > 0 Core.println(Core.stderr, "ERROR: @cuda dynamic parallelism does not support closures") quote @@ -451,11 +455,14 @@ No keyword arguments are supported. DeviceKernel{f,tt}(C_NULL) end else - # we can't compile here, so drop a marker which will get picked up during compilation + global delayed_cufunctions + push!(delayed_cufunctions, (f,tt)) + id = length(delayed_cufunctions) + + # drop a marker which will get picked up during compilation quote # TODO: add an edge to this method instance to support method redefinitions - fptr = ccall("extern cudanativeCompileKernel", llvmcall, Ptr{Cvoid}, - (Any, Any), f, tt) + fptr = ccall("extern cudanativeCompileKernel", llvmcall, Ptr{Cvoid}, (Int,), $id) DeviceKernel{f,tt}(fptr) end end From d4d6aca29cf118a9c24ac8e97b82abdb998a7d0a Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Fri, 22 Mar 2019 10:01:06 +0100 Subject: [PATCH 30/34] Allow and test anonymous functions and closures. --- src/execution.jl | 23 ++++++++--------------- test/device/execution.jl | 31 +++++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+), 15 deletions(-) diff --git a/src/execution.jl b/src/execution.jl index 7c6f861b..12011364 100644 --- a/src/execution.jl +++ b/src/execution.jl @@ -448,23 +448,16 @@ No keyword arguments are supported. 
const delayed_cufunctions = Vector{Tuple{Core.Function,Type}}() @generated function delayed_cufunction(::Val{f}, ::Val{tt}) where {f,tt} - if sizeof(f) > 0 - Core.println(Core.stderr, "ERROR: @cuda dynamic parallelism does not support closures") - quote - trap() - DeviceKernel{f,tt}(C_NULL) - end - else - global delayed_cufunctions - push!(delayed_cufunctions, (f,tt)) - id = length(delayed_cufunctions) + global delayed_cufunctions + push!(delayed_cufunctions, (f,tt)) + id = length(delayed_cufunctions) + quote # drop a marker which will get picked up during compilation - quote - # TODO: add an edge to this method instance to support method redefinitions - fptr = ccall("extern cudanativeCompileKernel", llvmcall, Ptr{Cvoid}, (Int,), $id) - DeviceKernel{f,tt}(fptr) - end + # TODO: add an edge to this method instance to support method redefinitions + fptr = ccall("extern cudanativeCompileKernel", llvmcall, Ptr{Cvoid}, (Int,), $id) + + DeviceKernel{f,tt}(fptr) end end diff --git a/test/device/execution.jl b/test/device/execution.jl index 86a7d8e5..edc8c107 100644 --- a/test/device/execution.jl +++ b/test/device/execution.jl @@ -828,6 +828,37 @@ end @test out == "Hello, World!" end +@testset "anonymous functions" begin + function hello() + @cuprintf("Hello, ") + world = () -> (@cuprintf("World!"); nothing) + @cuda dynamic=true world() + return + end + + _, out = @grab_output begin + @cuda hello() + synchronize() + end + @test out == "Hello, World!" +end + +@testset "closures" begin + function hello() + x = 1 + @cuprintf("Hello, ") + world = () -> (@cuprintf("World %ld!", x); nothing) + @cuda dynamic=true world() + return + end + + _, out = @grab_output begin + @cuda hello() + synchronize() + end + @test out == "Hello, World 1!" +end + @testset "argument passing" begin function kernel(a, b, c) @cuprintf("%ld %ld %ld", Int64(a), Int64(b), Int64(c)) From a2a07a6e1880d63d3262eff1075204367499c478 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Fri, 22 Mar 2019 10:39:42 +0100 Subject: [PATCH 31/34] Add wrappers for older versions of Julia. 
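Julia 1.2.0-DEV.512 and newer can call named external device functions directly with `ccall("extern ...", llvmcall, ...)`; older versions need a hand-written `Base.llvmcall` with an explicit `(declarations, body)` IR tuple, so each device-runtime entry point gets a version-gated wrapper. A tiny host-runnable sketch of the gating pattern — the IR here is deliberately self-contained, whereas the real fallbacks declare `cudaLaunchDeviceV2`, `cudaGetParameterBufferV2` and `cudaDeviceSynchronize`:

    if VERSION >= v"1.2.0-DEV.512"
        # stands in for the `ccall("extern ...", llvmcall, ...)` form on newer Julia
        @inline add_one(x::Int32) = x + Int32(1)
    else
        # older Julia: hand-written inline IR via Base.llvmcall
        @inline add_one(x::Int32) = Base.llvmcall(
            """
            %r = add i32 %0, 1
            ret i32 %r""", Int32, Tuple{Int32}, x)
    end

    add_one(Int32(41))    # 42 on either code path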
--- src/compiler/optim.jl | 9 +++-- src/device/cuda/libcudadevrt.jl | 72 +++++++++++++++++++++++++++++---- src/execution.jl | 17 ++++++-- 3 files changed, 84 insertions(+), 14 deletions(-) diff --git a/src/compiler/optim.jl b/src/compiler/optim.jl index eae9ae3f..887078fb 100644 --- a/src/compiler/optim.jl +++ b/src/compiler/optim.jl @@ -58,10 +58,13 @@ function optimize!(job::CompilerJob, mod::LLVM.Module, entry::LLVM.Function) add!(pm, ModulePass("LowerPTLS", lower_ptls!)) # the Julia GC lowering pass also has some clean-up that is required - function LLVMAddLateLowerGCFramePass(PM::LLVM.API.LLVMPassManagerRef) - LLVM.@apicall(:LLVMExtraAddLateLowerGCFramePass,Cvoid,(LLVM.API.LLVMPassManagerRef,), PM) + if VERSION >= v"1.2.0-DEV.520" + # TODO: move this to LLVM.jl + function LLVMAddLateLowerGCFramePass(PM::LLVM.API.LLVMPassManagerRef) + LLVM.@apicall(:LLVMExtraAddLateLowerGCFramePass,Cvoid,(LLVM.API.LLVMPassManagerRef,), PM) + end + LLVMAddLateLowerGCFramePass(LLVM.ref(pm)) end - LLVMAddLateLowerGCFramePass(LLVM.ref(pm)) run!(pm, mod) end diff --git a/src/device/cuda/libcudadevrt.jl b/src/device/cuda/libcudadevrt.jl index 59f7c969..03ca74c8 100644 --- a/src/device/cuda/libcudadevrt.jl +++ b/src/device/cuda/libcudadevrt.jl @@ -9,6 +9,25 @@ import CUDAdrv: CuDim3, CuStream_t const cudaError_t = Cint const cudaStream_t = CUDAdrv.CuStream_t +if VERSION >= v"1.2.0-DEV.512" + @inline cudaLaunchDevice(buf::Ptr{Cvoid}, stream::CuStream) = + ccall("extern cudaLaunchDeviceV2", llvmcall, cudaError_t, + (Ptr{Cvoid}, cudaStream_t), + buf, stream) +else + import Base.Sys: WORD_SIZE + # declare i32 @cudaLaunchDeviceV2(i8*, %struct.CUstream_st*) + @eval @inline cudaLaunchDevice(buf::Ptr{Cvoid}, stream::CuStream) = + Base.llvmcall( + ( "declare i32 @cudaLaunchDeviceV2(i8*, i8*)", + $"%buf = inttoptr i$WORD_SIZE %0 to i8* + %stream = inttoptr i$WORD_SIZE %1 to i8* + %rv = call i32 @cudaLaunchDeviceV2(i8* %buf, i8* %stream) + ret i32 %rv"), cudaError_t, + Tuple{Ptr{Cvoid}, cudaStream_t}, + buf, Base.unsafe_convert(cudaStream_t, stream)) +end + # device-side counterpart of CUDAdrv.launch @inline function launch(f::Ptr{Cvoid}, blocks::CuDim, threads::CuDim, shmem::Int, stream::CuStream, @@ -17,21 +36,48 @@ const cudaStream_t = CUDAdrv.CuStream_t threads = CuDim3(threads) buf = parameter_buffer(f, blocks, threads, shmem, args...) 
- - ccall("extern cudaLaunchDeviceV2", llvmcall, cudaError_t, - (Ptr{Cvoid}, cudaStream_t), - buf, stream) + cudaLaunchDevice(buf, stream) return end +if VERSION >= v"1.2.0-DEV.512" + @inline cudaGetParameterBuffer(f::Ptr{Cvoid}, blocks::CuDim3, threads::CuDim3, shmem::Integer) = + ccall("extern cudaGetParameterBufferV2", llvmcall, Ptr{Cvoid}, + (Ptr{Cvoid}, CuDim3, CuDim3, Cuint), + f, blocks, threads, shmem) +else + @inline cudaGetParameterBuffer(f::Ptr{Cvoid}, blocks::CuDim3, threads::CuDim3, shmem::Integer) = + cudaGetParameterBuffer(f, + blocks.x, blocks.y, blocks.z, + threads.x, threads.y, threads.z, + convert(Cuint, shmem)) + # declare i8* @cudaGetParameterBufferV2(i8*, %struct.dim3, %struct.dim3, i32) + @eval @inline cudaGetParameterBuffer(f::Ptr{Cvoid}, + blocks_x::Cuint, blocks_y::Cuint, blocks_z::Cuint, + threads_x::Cuint, threads_y::Cuint, threads_z::Cuint, + shmem::Cuint) = + Base.llvmcall( + ( "declare i8* @cudaGetParameterBufferV2(i8*, {i32,i32,i32}, {i32,i32,i32}, i32)", + $"%f = inttoptr i$WORD_SIZE %0 to i8* + %blocks.x = insertvalue { i32, i32, i32 } undef, i32 %1, 0 + %blocks.y = insertvalue { i32, i32, i32 } %blocks.x, i32 %2, 1 + %blocks.z = insertvalue { i32, i32, i32 } %blocks.y, i32 %3, 2 + %threads.x = insertvalue { i32, i32, i32 } undef, i32 %4, 0 + %threads.y = insertvalue { i32, i32, i32 } %threads.x, i32 %5, 1 + %threads.z = insertvalue { i32, i32, i32 } %threads.y, i32 %6, 2 + %rv = call i8* @cudaGetParameterBufferV2(i8* %f, {i32,i32,i32} %blocks.z, {i32,i32,i32} %threads.z, i32 %7) + %buf = ptrtoint i8* %rv to i$WORD_SIZE + ret i$WORD_SIZE %buf"), Ptr{Cvoid}, + Tuple{Ptr{Cvoid}, Cuint, Cuint, Cuint, Cuint, Cuint, Cuint, Cuint}, + f, blocks_x, blocks_y, blocks_z, threads_x, threads_y, threads_z, shmem) +end + @generated function parameter_buffer(f::Ptr{Cvoid}, blocks::CuDim3, threads::CuDim3, shmem::Int, args...) # allocate a buffer ex = quote - buf = ccall("extern cudaGetParameterBufferV2", llvmcall, Ptr{Cvoid}, - (Ptr{Cvoid}, CuDim3, CuDim3, Cuint), - f, blocks, threads, shmem) + buf = cudaGetParameterBuffer(f, blocks, threads, shmem) end # store the parameters @@ -55,6 +101,16 @@ end return ex end +if VERSION >= v"1.2.0-DEV.512" + @inline synchronize() = ccall("extern cudaDeviceSynchronize", llvmcall, Cint, ()) +else + @eval @inline synchronize() = + Base.llvmcall( + ("declare i32 @cudaDeviceSynchronize()", + "%rv = call i32 @cudaDeviceSynchronize() + ret i32 %rv"), cudaError_t, Tuple{}) +end + """ synchronize() @@ -64,4 +120,4 @@ and should not be called from the host. `synchronize` acts as a synchronization point for child grids in the context of dynamic parallelism. """ -@inline synchronize() = ccall("extern cudaDeviceSynchronize", llvmcall, Cint, ()) +synchronize diff --git a/src/execution.jl b/src/execution.jl index 12011364..28fb485e 100644 --- a/src/execution.jl +++ b/src/execution.jl @@ -446,6 +446,19 @@ No keyword arguments are supported. 
@inline dynamic_cufunction(f::Core.Function, tt::Type=Tuple{}) = delayed_cufunction(Val(f), Val(tt)) +# marker function that will get picked up during compilation +if VERSION >= v"1.2.0-DEV.512" + @inline cudanativeCompileKernel(id::Int) = + ccall("extern cudanativeCompileKernel", llvmcall, Ptr{Cvoid}, (Int,), id) +else + import Base.Sys: WORD_SIZE + @eval @inline cudanativeCompileKernel(id::Int) = + Base.llvmcall( + ($"declare i$WORD_SIZE @cudanativeCompileKernel(i$WORD_SIZE)", + $"%rv = call i$WORD_SIZE @cudanativeCompileKernel(i$WORD_SIZE %0) + ret i$WORD_SIZE %rv"), Ptr{Cvoid}, Tuple{Int}, id) +end + const delayed_cufunctions = Vector{Tuple{Core.Function,Type}}() @generated function delayed_cufunction(::Val{f}, ::Val{tt}) where {f,tt} global delayed_cufunctions @@ -453,10 +466,8 @@ const delayed_cufunctions = Vector{Tuple{Core.Function,Type}}() id = length(delayed_cufunctions) quote - # drop a marker which will get picked up during compilation # TODO: add an edge to this method instance to support method redefinitions - fptr = ccall("extern cudanativeCompileKernel", llvmcall, Ptr{Cvoid}, (Int,), $id) - + fptr = cudanativeCompileKernel($id) DeviceKernel{f,tt}(fptr) end end From 8d2131f805bbcf29c3105aa86f662523cac257e6 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Fri, 22 Mar 2019 10:54:35 +0100 Subject: [PATCH 32/34] Export dynamic_cufunction. --- src/execution.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/execution.jl b/src/execution.jl index 28fb485e..079edb19 100644 --- a/src/execution.jl +++ b/src/execution.jl @@ -1,6 +1,6 @@ # Native execution support -export @cuda, cudaconvert, cufunction, nearest_warpsize +export @cuda, cudaconvert, cufunction, dynamic_cufunction, nearest_warpsize ## helper functions From 0144cfc1638ab3c4b0a4db3f6998425d02865c5b Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Fri, 22 Mar 2019 11:07:38 +0100 Subject: [PATCH 33/34] Test complex argument passing. --- test/device/execution.jl | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/test/device/execution.jl b/test/device/execution.jl index edc8c107..96876afd 100644 --- a/test/device/execution.jl +++ b/test/device/execution.jl @@ -860,6 +860,8 @@ end end @testset "argument passing" begin + ## padding + function kernel(a, b, c) @cuprintf("%ld %ld %ld", Int64(a), Int64(b), Int64(c)) return @@ -875,6 +877,23 @@ end end @test out == "1 2 3" end + + ## conversion + + function kernel(a) + increment(a) = (a[1] += 1; nothing) + + a[1] = 1 + increment(a) + @cuda dynamic=true increment(a) + + return + end + + dA = CuTestArray{Int,1}((1,)) + @cuda kernel(dA) + A = Array(dA) + @test A == [3] end @testset "self-recursion" begin From da510866b5c4b6800d33fae6cb52beff9338296e Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Fri, 22 Mar 2019 11:15:57 +0100 Subject: [PATCH 34/34] Version bound the closure+capture test. --- test/device/execution.jl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/device/execution.jl b/test/device/execution.jl index 96876afd..293c0701 100644 --- a/test/device/execution.jl +++ b/test/device/execution.jl @@ -843,6 +843,7 @@ end @test out == "Hello, World!" end +if VERSION >= v"1.1" # behavior of captured variables (box or not) has improved over time @testset "closures" begin function hello() x = 1 @@ -858,6 +859,7 @@ end end @test out == "Hello, World 1!" end +end @testset "argument passing" begin ## padding