diff --git a/src/compiler/optim.jl b/src/compiler/optim.jl index eae9ae3f..887078fb 100644 --- a/src/compiler/optim.jl +++ b/src/compiler/optim.jl @@ -58,10 +58,13 @@ function optimize!(job::CompilerJob, mod::LLVM.Module, entry::LLVM.Function) add!(pm, ModulePass("LowerPTLS", lower_ptls!)) # the Julia GC lowering pass also has some clean-up that is required - function LLVMAddLateLowerGCFramePass(PM::LLVM.API.LLVMPassManagerRef) - LLVM.@apicall(:LLVMExtraAddLateLowerGCFramePass,Cvoid,(LLVM.API.LLVMPassManagerRef,), PM) + if VERSION >= v"1.2.0-DEV.520" + # TODO: move this to LLVM.jl + function LLVMAddLateLowerGCFramePass(PM::LLVM.API.LLVMPassManagerRef) + LLVM.@apicall(:LLVMExtraAddLateLowerGCFramePass,Cvoid,(LLVM.API.LLVMPassManagerRef,), PM) + end + LLVMAddLateLowerGCFramePass(LLVM.ref(pm)) end - LLVMAddLateLowerGCFramePass(LLVM.ref(pm)) run!(pm, mod) end diff --git a/src/device/cuda/libcudadevrt.jl b/src/device/cuda/libcudadevrt.jl index 59f7c969..03ca74c8 100644 --- a/src/device/cuda/libcudadevrt.jl +++ b/src/device/cuda/libcudadevrt.jl @@ -9,6 +9,25 @@ import CUDAdrv: CuDim3, CuStream_t const cudaError_t = Cint const cudaStream_t = CUDAdrv.CuStream_t +if VERSION >= v"1.2.0-DEV.512" + @inline cudaLaunchDevice(buf::Ptr{Cvoid}, stream::CuStream) = + ccall("extern cudaLaunchDeviceV2", llvmcall, cudaError_t, + (Ptr{Cvoid}, cudaStream_t), + buf, stream) +else + import Base.Sys: WORD_SIZE + # declare i32 @cudaLaunchDeviceV2(i8*, %struct.CUstream_st*) + @eval @inline cudaLaunchDevice(buf::Ptr{Cvoid}, stream::CuStream) = + Base.llvmcall( + ( "declare i32 @cudaLaunchDeviceV2(i8*, i8*)", + $"%buf = inttoptr i$WORD_SIZE %0 to i8* + %stream = inttoptr i$WORD_SIZE %1 to i8* + %rv = call i32 @cudaLaunchDeviceV2(i8* %buf, i8* %stream) + ret i32 %rv"), cudaError_t, + Tuple{Ptr{Cvoid}, cudaStream_t}, + buf, Base.unsafe_convert(cudaStream_t, stream)) +end + # device-side counterpart of CUDAdrv.launch @inline function launch(f::Ptr{Cvoid}, blocks::CuDim, threads::CuDim, shmem::Int, stream::CuStream, @@ -17,21 +36,48 @@ const cudaStream_t = CUDAdrv.CuStream_t threads = CuDim3(threads) buf = parameter_buffer(f, blocks, threads, shmem, args...) - - ccall("extern cudaLaunchDeviceV2", llvmcall, cudaError_t, - (Ptr{Cvoid}, cudaStream_t), - buf, stream) + cudaLaunchDevice(buf, stream) return end +if VERSION >= v"1.2.0-DEV.512" + @inline cudaGetParameterBuffer(f::Ptr{Cvoid}, blocks::CuDim3, threads::CuDim3, shmem::Integer) = + ccall("extern cudaGetParameterBufferV2", llvmcall, Ptr{Cvoid}, + (Ptr{Cvoid}, CuDim3, CuDim3, Cuint), + f, blocks, threads, shmem) +else + @inline cudaGetParameterBuffer(f::Ptr{Cvoid}, blocks::CuDim3, threads::CuDim3, shmem::Integer) = + cudaGetParameterBuffer(f, + blocks.x, blocks.y, blocks.z, + threads.x, threads.y, threads.z, + convert(Cuint, shmem)) + # declare i8* @cudaGetParameterBufferV2(i8*, %struct.dim3, %struct.dim3, i32) + @eval @inline cudaGetParameterBuffer(f::Ptr{Cvoid}, + blocks_x::Cuint, blocks_y::Cuint, blocks_z::Cuint, + threads_x::Cuint, threads_y::Cuint, threads_z::Cuint, + shmem::Cuint) = + Base.llvmcall( + ( "declare i8* @cudaGetParameterBufferV2(i8*, {i32,i32,i32}, {i32,i32,i32}, i32)", + $"%f = inttoptr i$WORD_SIZE %0 to i8* + %blocks.x = insertvalue { i32, i32, i32 } undef, i32 %1, 0 + %blocks.y = insertvalue { i32, i32, i32 } %blocks.x, i32 %2, 1 + %blocks.z = insertvalue { i32, i32, i32 } %blocks.y, i32 %3, 2 + %threads.x = insertvalue { i32, i32, i32 } undef, i32 %4, 0 + %threads.y = insertvalue { i32, i32, i32 } %threads.x, i32 %5, 1 + %threads.z = insertvalue { i32, i32, i32 } %threads.y, i32 %6, 2 + %rv = call i8* @cudaGetParameterBufferV2(i8* %f, {i32,i32,i32} %blocks.z, {i32,i32,i32} %threads.z, i32 %7) + %buf = ptrtoint i8* %rv to i$WORD_SIZE + ret i$WORD_SIZE %buf"), Ptr{Cvoid}, + Tuple{Ptr{Cvoid}, Cuint, Cuint, Cuint, Cuint, Cuint, Cuint, Cuint}, + f, blocks_x, blocks_y, blocks_z, threads_x, threads_y, threads_z, shmem) +end + @generated function parameter_buffer(f::Ptr{Cvoid}, blocks::CuDim3, threads::CuDim3, shmem::Int, args...) # allocate a buffer ex = quote - buf = ccall("extern cudaGetParameterBufferV2", llvmcall, Ptr{Cvoid}, - (Ptr{Cvoid}, CuDim3, CuDim3, Cuint), - f, blocks, threads, shmem) + buf = cudaGetParameterBuffer(f, blocks, threads, shmem) end # store the parameters @@ -55,6 +101,16 @@ end return ex end +if VERSION >= v"1.2.0-DEV.512" + @inline synchronize() = ccall("extern cudaDeviceSynchronize", llvmcall, Cint, ()) +else + @eval @inline synchronize() = + Base.llvmcall( + ("declare i32 @cudaDeviceSynchronize()", + "%rv = call i32 @cudaDeviceSynchronize() + ret i32 %rv"), cudaError_t, Tuple{}) +end + """ synchronize() @@ -64,4 +120,4 @@ and should not be called from the host. `synchronize` acts as a synchronization point for child grids in the context of dynamic parallelism. """ -@inline synchronize() = ccall("extern cudaDeviceSynchronize", llvmcall, Cint, ()) +synchronize diff --git a/src/execution.jl b/src/execution.jl index 12011364..28fb485e 100644 --- a/src/execution.jl +++ b/src/execution.jl @@ -446,6 +446,19 @@ No keyword arguments are supported. @inline dynamic_cufunction(f::Core.Function, tt::Type=Tuple{}) = delayed_cufunction(Val(f), Val(tt)) +# marker function that will get picked up during compilation +if VERSION >= v"1.2.0-DEV.512" + @inline cudanativeCompileKernel(id::Int) = + ccall("extern cudanativeCompileKernel", llvmcall, Ptr{Cvoid}, (Int,), id) +else + import Base.Sys: WORD_SIZE + @eval @inline cudanativeCompileKernel(id::Int) = + Base.llvmcall( + ($"declare i$WORD_SIZE @cudanativeCompileKernel(i$WORD_SIZE)", + $"%rv = call i$WORD_SIZE @cudanativeCompileKernel(i$WORD_SIZE %0) + ret i$WORD_SIZE %rv"), Ptr{Cvoid}, Tuple{Int}, id) +end + const delayed_cufunctions = Vector{Tuple{Core.Function,Type}}() @generated function delayed_cufunction(::Val{f}, ::Val{tt}) where {f,tt} global delayed_cufunctions @@ -453,10 +466,8 @@ const delayed_cufunctions = Vector{Tuple{Core.Function,Type}}() id = length(delayed_cufunctions) quote - # drop a marker which will get picked up during compilation # TODO: add an edge to this method instance to support method redefinitions - fptr = ccall("extern cudanativeCompileKernel", llvmcall, Ptr{Cvoid}, (Int,), $id) - + fptr = cudanativeCompileKernel($id) DeviceKernel{f,tt}(fptr) end end