This repository has been archived by the owner on May 27, 2021. It is now read-only.
-
-
Notifications
You must be signed in to change notification settings - Fork 55
/
libcudadevrt.jl
67 lines (54 loc) · 2.23 KB
/
libcudadevrt.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
# wrappers for the libcudadevrt library
#
# The libcudadevrt library is a collection of PTX bitcode functions that implement parts of
# the CUDA API for execution on the device, such as device synchronization primitives,
# dynamic kernel APIs, etc.
import CUDAdrv: CuDim3, CuStream_t
# C-level type aliases used in the `ccall` signatures of this file.
# `cudaError_t`: integer status code type, aliased to `Cint`.
const cudaError_t = Cint
# `cudaStream_t`: stream handle type, aliased to the one defined by CUDAdrv.
const cudaStream_t = CUDAdrv.CuStream_t
# device-side counterpart of CUDAdrv.launch
"""
    launch(fptr, blocks, threads, shmem, stream, args...)

Launch the function pointed to by `fptr` from device code.

`blocks` and `threads` are converted to `CuDim3`, a parameter buffer holding `args` is
obtained from [`parameter_buffer`](@ref), and that buffer is handed to
`cudaLaunchDeviceV2` together with `stream`.

Returns `nothing`; the status code produced by `cudaLaunchDeviceV2` is not inspected.
"""
@inline function launch(fptr::Ptr{Cvoid}, blocks::CuDim, threads::CuDim,
                        shmem::Int, stream::CuStream,
                        args...)
    # normalize the launch configuration to its three-component form
    griddim = CuDim3(blocks)
    blockdim = CuDim3(threads)

    # buffer carrying the launch configuration and the kernel arguments
    params = parameter_buffer(fptr, griddim, blockdim, shmem, args...)

    ccall("extern cudaLaunchDeviceV2", llvmcall, cudaError_t,
          (Ptr{Cvoid}, cudaStream_t),
          params, stream)

    return nothing
end
"""
    parameter_buffer(fptr, blocks, threads, shmem, args...)

Obtain a parameter buffer from `cudaGetParameterBufferV2` for launching `fptr` with the
given `blocks`, `threads` and `shmem` configuration, store every element of `args` in it,
and return the buffer as a `Ptr{Cvoid}`.

This is a generated function: the placement of each argument is computed from the argument
*types* when the method is generated, so the emitted code consists only of the buffer
request followed by one `unsafe_store!` per stored argument.

Each argument is placed at the first multiple of its own size that lies at or past the end
of the preceding argument. Arguments whose type has size zero occupy no bytes; they are
not stored and do not affect the placement of the arguments that follow.
"""
@generated function parameter_buffer(fptr::Ptr{Cvoid}, blocks::CuDim3, threads::CuDim3,
                                     shmem::Int, args...)
    # NOTE: in the generator body, `args` is a tuple of the argument types

    # allocate a buffer
    ex = quote
        buf = ccall("extern cudaGetParameterBufferV2", llvmcall, Ptr{Cvoid},
                    (Ptr{Cvoid}, CuDim3, CuDim3, Cuint),
                    fptr, blocks, threads, shmem)
    end

    # store the parameters
    #
    # > Each individual parameter placed in the parameter buffer is required to be aligned.
    # > That is, each parameter must be placed at the n-th byte in the parameter buffer,
    # > where n is the smallest multiple of the parameter size that is greater than the
    # > offset of the last byte taken by the preceding parameter. The maximum size of the
    # > parameter buffer is 4KB.
    offset = 0  # number of bytes taken by the parameters placed so far
    for (i, T) in enumerate(args)
        nbytes = sizeof(T)

        # A zero-size value has nothing to store, and dividing by its size below would
        # make code generation fail, so leave it out of the buffer entirely.
        nbytes == 0 && continue

        # 1-based index, counted in units of `nbytes`, of the first slot that starts at or
        # past `offset`; integer ceiling division avoids a round-trip through Float64
        buf_index = Base.cld(offset, nbytes) + 1
        offset = buf_index * nbytes

        push!(ex.args, :(
            unsafe_store!(Base.unsafe_convert(Ptr{$T}, buf), args[$i], $buf_index)
        ))
    end

    push!(ex.args, :(return buf))

    return ex
end
"""
    synchronize()

Block until the device has finished its work.

This method is the device-side variant and must not be invoked from host code. In the
context of dynamic parallelism, it serves as a synchronization point for child grids.

The `Cint` value produced by `cudaDeviceSynchronize` is returned.
"""
@inline function synchronize()
    return ccall("extern cudaDeviceSynchronize", llvmcall, Cint, ())
end