diff --git a/src/compiler/optim.jl b/src/compiler/optim.jl
index eae9ae3f..887078fb 100644
--- a/src/compiler/optim.jl
+++ b/src/compiler/optim.jl
@@ -58,10 +58,13 @@ function optimize!(job::CompilerJob, mod::LLVM.Module, entry::LLVM.Function)
             add!(pm, ModulePass("LowerPTLS", lower_ptls!))
 
             # the Julia GC lowering pass also has some clean-up that is required
-            function LLVMAddLateLowerGCFramePass(PM::LLVM.API.LLVMPassManagerRef)
-                LLVM.@apicall(:LLVMExtraAddLateLowerGCFramePass,Cvoid,(LLVM.API.LLVMPassManagerRef,), PM)
+            if VERSION >= v"1.2.0-DEV.520"
+                # TODO: move this to LLVM.jl
+                function LLVMAddLateLowerGCFramePass(PM::LLVM.API.LLVMPassManagerRef)
+                    LLVM.@apicall(:LLVMExtraAddLateLowerGCFramePass,Cvoid,(LLVM.API.LLVMPassManagerRef,), PM)
+                end
+                LLVMAddLateLowerGCFramePass(LLVM.ref(pm))
             end
-            LLVMAddLateLowerGCFramePass(LLVM.ref(pm))
 
             run!(pm, mod)
         end
diff --git a/src/device/cuda/libcudadevrt.jl b/src/device/cuda/libcudadevrt.jl
index 59f7c969..03ca74c8 100644
--- a/src/device/cuda/libcudadevrt.jl
+++ b/src/device/cuda/libcudadevrt.jl
@@ -9,6 +9,25 @@ import CUDAdrv: CuDim3, CuStream_t
 const cudaError_t = Cint
 const cudaStream_t = CUDAdrv.CuStream_t
 
+if VERSION >= v"1.2.0-DEV.512"
+    @inline cudaLaunchDevice(buf::Ptr{Cvoid}, stream::CuStream) =
+        ccall("extern cudaLaunchDeviceV2", llvmcall, cudaError_t,
+              (Ptr{Cvoid}, cudaStream_t),
+              buf, stream)
+else
+    import Base.Sys: WORD_SIZE
+    # declare i32 @cudaLaunchDeviceV2(i8*, %struct.CUstream_st*)
+    @eval @inline cudaLaunchDevice(buf::Ptr{Cvoid}, stream::CuStream) =
+            Base.llvmcall(
+                ( "declare i32 @cudaLaunchDeviceV2(i8*, i8*)",
+                 $"%buf = inttoptr i$WORD_SIZE %0 to i8*
+                   %stream = inttoptr i$WORD_SIZE %1 to i8*
+                   %rv = call i32 @cudaLaunchDeviceV2(i8* %buf, i8* %stream)
+                   ret i32 %rv"), cudaError_t,
+                Tuple{Ptr{Cvoid}, cudaStream_t},
+                buf, Base.unsafe_convert(cudaStream_t, stream))
+end
+
 # device-side counterpart of CUDAdrv.launch
 @inline function launch(f::Ptr{Cvoid}, blocks::CuDim, threads::CuDim,
                         shmem::Int, stream::CuStream,
@@ -17,21 +36,48 @@ const cudaStream_t = CUDAdrv.CuStream_t
     threads = CuDim3(threads)
 
     buf = parameter_buffer(f, blocks, threads, shmem, args...)
-
-    ccall("extern cudaLaunchDeviceV2", llvmcall, cudaError_t,
-          (Ptr{Cvoid}, cudaStream_t),
-          buf, stream)
+    cudaLaunchDevice(buf, stream)
 
     return
 end
 
+if VERSION >= v"1.2.0-DEV.512"
+    @inline cudaGetParameterBuffer(f::Ptr{Cvoid}, blocks::CuDim3, threads::CuDim3, shmem::Integer) =
+        ccall("extern cudaGetParameterBufferV2", llvmcall, Ptr{Cvoid},
+                            (Ptr{Cvoid}, CuDim3, CuDim3, Cuint),
+                            f, blocks, threads, shmem)
+else
+    @inline cudaGetParameterBuffer(f::Ptr{Cvoid}, blocks::CuDim3, threads::CuDim3, shmem::Integer) =
+        cudaGetParameterBuffer(f,
+                               blocks.x, blocks.y, blocks.z,
+                               threads.x, threads.y, threads.z,
+                               convert(Cuint, shmem))
+    # declare i8* @cudaGetParameterBufferV2(i8*, %struct.dim3, %struct.dim3, i32)
+    @eval @inline cudaGetParameterBuffer(f::Ptr{Cvoid},
+                                         blocks_x::Cuint, blocks_y::Cuint, blocks_z::Cuint,
+                                         threads_x::Cuint, threads_y::Cuint, threads_z::Cuint,
+                                         shmem::Cuint) =
+            Base.llvmcall(
+                ( "declare i8* @cudaGetParameterBufferV2(i8*, {i32,i32,i32}, {i32,i32,i32}, i32)",
+                 $"%f = inttoptr i$WORD_SIZE %0 to i8*
+                   %blocks.x = insertvalue { i32, i32, i32 } undef, i32 %1, 0
+                   %blocks.y = insertvalue { i32, i32, i32 } %blocks.x, i32 %2, 1
+                   %blocks.z = insertvalue { i32, i32, i32 } %blocks.y, i32 %3, 2
+                   %threads.x = insertvalue { i32, i32, i32 } undef, i32 %4, 0
+                   %threads.y = insertvalue { i32, i32, i32 } %threads.x, i32 %5, 1
+                   %threads.z = insertvalue { i32, i32, i32 } %threads.y, i32 %6, 2
+                   %rv = call i8* @cudaGetParameterBufferV2(i8* %f, {i32,i32,i32} %blocks.z, {i32,i32,i32} %threads.z, i32 %7)
+                   %buf = ptrtoint i8* %rv to i$WORD_SIZE
+                   ret i$WORD_SIZE %buf"), Ptr{Cvoid},
+                Tuple{Ptr{Cvoid}, Cuint, Cuint, Cuint, Cuint, Cuint, Cuint, Cuint},
+                f, blocks_x, blocks_y, blocks_z, threads_x, threads_y, threads_z, shmem)
+end
+
 @generated function parameter_buffer(f::Ptr{Cvoid}, blocks::CuDim3, threads::CuDim3,
                                      shmem::Int, args...)
     # allocate a buffer
     ex = quote
-        buf = ccall("extern cudaGetParameterBufferV2", llvmcall, Ptr{Cvoid},
-                    (Ptr{Cvoid}, CuDim3, CuDim3, Cuint),
-                    f, blocks, threads, shmem)
+        buf = cudaGetParameterBuffer(f, blocks, threads, shmem)
     end
 
     # store the parameters
@@ -55,6 +101,16 @@ end
     return ex
 end
 
+if VERSION >= v"1.2.0-DEV.512"
+    @inline synchronize() = ccall("extern cudaDeviceSynchronize", llvmcall, Cint, ())
+else
+    @eval @inline synchronize() =
+            Base.llvmcall(
+                ("declare i32 @cudaDeviceSynchronize()",
+                 "%rv = call i32 @cudaDeviceSynchronize()
+                  ret i32 %rv"), cudaError_t, Tuple{})
+end
+
 """
     synchronize()
 
@@ -64,4 +120,4 @@ and should not be called from the host.
 `synchronize` acts as a synchronization point for
 child grids in the context of dynamic parallelism.
 """
-@inline synchronize() = ccall("extern cudaDeviceSynchronize", llvmcall, Cint, ())
+synchronize
diff --git a/src/execution.jl b/src/execution.jl
index 12011364..28fb485e 100644
--- a/src/execution.jl
+++ b/src/execution.jl
@@ -446,6 +446,19 @@ No keyword arguments are supported.
 @inline dynamic_cufunction(f::Core.Function, tt::Type=Tuple{}) =
     delayed_cufunction(Val(f), Val(tt))
 
+# marker function that will get picked up during compilation
+if VERSION >= v"1.2.0-DEV.512"
+    @inline cudanativeCompileKernel(id::Int) =
+        ccall("extern cudanativeCompileKernel", llvmcall, Ptr{Cvoid}, (Int,), id)
+else
+    import Base.Sys: WORD_SIZE
+    @eval @inline cudanativeCompileKernel(id::Int) =
+        Base.llvmcall(
+            ($"declare i$WORD_SIZE @cudanativeCompileKernel(i$WORD_SIZE)",
+             $"%rv = call i$WORD_SIZE @cudanativeCompileKernel(i$WORD_SIZE %0)
+               ret i$WORD_SIZE %rv"), Ptr{Cvoid}, Tuple{Int}, id)
+end
+
 const delayed_cufunctions = Vector{Tuple{Core.Function,Type}}()
 @generated function delayed_cufunction(::Val{f}, ::Val{tt}) where {f,tt}
     global delayed_cufunctions
@@ -453,10 +466,8 @@ const delayed_cufunctions = Vector{Tuple{Core.Function,Type}}()
     id = length(delayed_cufunctions)
 
     quote
-        # drop a marker which will get picked up during compilation
         # TODO: add an edge to this method instance to support method redefinitions
-        fptr = ccall("extern cudanativeCompileKernel", llvmcall, Ptr{Cvoid}, (Int,), $id)
-
+        fptr = cudanativeCompileKernel($id)
         DeviceKernel{f,tt}(fptr)
     end
 end