# Native execution support

export @cuda, nearest_warpsize, cudaconvert

using Base.Iterators: filter
"""
cudaconvert(x)
This function is called for every argument to be passed to a kernel, allowing it to be
converted to a GPU-friendly format. By default, the function does nothing and returns the
input object `x` as-is.
For `CuArray` objects, a corresponding `CuDeviceArray` object in global space is returned,
which implements GPU-compatible array functionality.
"""
function cudaconvert(x::T) where {T}
    if T.layout == C_NULL || !Base.datatype_pointerfree(T)
        error("don't know how to handle argument of type $T")
    end
    return x
end
cudaconvert(x::Tuple) = cudaconvert.(x)
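
#=
Example (a minimal sketch of the conversion rules above):

    cudaconvert(1.0f0)        # passes through: Float32 is pointer-free
    cudaconvert((1, 2.0))     # converts each tuple element recursively
    cudaconvert("a string")   # errors: String is not a pointer-free type

The `CuArray` → `CuDeviceArray` method mentioned in the docstring is defined
alongside the array types, not in this file.
=#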
# fast lookup of global world age
world_age() = ccall(:jl_get_tls_world_age, UInt, ())
# slow lookup of local method age
function method_age(f, tt)
    for m in Base._methods(f, tt, 1, typemax(UInt))
        return m[3].min_world
    end
    return -1
end
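
#=
Example (illustrative only): both helpers return plain `UInt` counters, which is
what makes them cheap enough to serve as cache keys further below.

    world_age()                   # current world age of the running task
    method_age(sin, (Float64,))   # first world in which `sin(::Float64)` was defined
=#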
isghosttype(dt) = !dt.mutable && sizeof(dt) == 0
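
#=
Example: "ghost" types are immutable singletons that occupy no memory, so they
carry no data and can be dropped from a kernel's argument list. `Empty` below is
a hypothetical struct for illustration.

    struct Empty end
    isghosttype(Empty)         # true: immutable and sizeof(Empty) == 0
    isghosttype(Int64)         # false: 8 bytes of data
    isghosttype(typeof(sin))   # true: functions are singleton types
=#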
"""
@cuda (gridDim::CuDim, blockDim::CuDim, [shmem::Int], [stream::CuStream]) func(args...)
High-level interface for calling functions on a GPU, queues a kernel launch on the current
context. The `gridDim` and `blockDim` arguments represent the launch configuration, the
optional `shmem` parameter specifies how much bytes of dynamic shared memory should be
allocated (defaulting to 0), while the optional `stream` parameter indicates on which stream
the launch should be scheduled.
The `func` argument should be a valid Julia function. It will be compiled to a CUDA function
upon first use, and to a certain extent arguments will be converted and managed
automatically (see [`cudaconvert`](@ref)). Finally, a call to `cudacall` is performed,
scheduling the compiled function for execution on the GPU.
"""
macro cuda(config::Expr, callexpr::Expr)
    # sanity checks
    if config.head != :tuple || !(2 <= length(config.args) <= 4)
        throw(ArgumentError("first argument to @cuda should be a tuple (gridDim, blockDim, [shmem], [stream])"))
    end
    if callexpr.head != :call
        throw(ArgumentError("second argument to @cuda should be a function call"))
    end

    # handle optional arguments and forward the call
    # NOTE: we duplicate CUDAdrv's default values of these arguments,
    #       because the kwarg version of `cudacall` is too slow
    stream = length(config.args)==4 ? esc(pop!(config.args)) : :(CuDefaultStream())
    shmem = length(config.args)==3 ? esc(pop!(config.args)) : :(0)
    dims = esc(config)
    args = :(cudaconvert.(($(map(esc, callexpr.args)...),)))
    return :(_cuda($dims, $shmem, $stream, $args...))
end
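
#=
Example (a minimal sketch, assuming a functional CUDA context and CUDAdrv's
`CuArray`; `vadd` is a hypothetical kernel):

    function vadd(a, b, c)
        i = (blockIdx().x-1) * blockDim().x + threadIdx().x
        c[i] = a[i] + b[i]
        return nothing
    end

    a = CuArray(rand(Float32, 1024))
    b = CuArray(rand(Float32, 1024))
    c = similar(a)

    @cuda (4, 256) vadd(a, b, c)   # gridDim = 4 blocks, blockDim = 256 threads
=#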
const agecache = Dict{UInt, UInt}()
const compilecache = Dict{UInt, CuFunction}()
@generated function _cuda(dims::Tuple{CuDim, CuDim}, shmem, stream,
                          func::Core.Function, argspec...)
    arg_exprs = [:( argspec[$i] ) for i in 1:length(argspec)]
    arg_types = argspec

    # filter out ghost arguments
    real_args = map(t->!isghosttype(t), arg_types)
    real_arg_types = map(x->x[2], filter(x->x[1], zip(real_args, arg_types)))
    real_arg_exprs = map(x->x[2], filter(x->x[1], zip(real_args, arg_exprs)))

    precomp_key = hash(tuple(func, arg_types...))  # precomputable part of the keys

    quote
        Base.@_inline_meta

        # look up the method age
        key1 = hash(($precomp_key, world_age()))
        if haskey(agecache, key1)
            age = agecache[key1]
        else
            age = method_age(func, $arg_types)
            age == -1 && throw(MethodError(func, Tuple{$arg_types...}))
            agecache[key1] = age
        end

        # compile the function
        ctx = CuCurrentContext()
        key2 = hash(($precomp_key, age, ctx))
        if haskey(compilecache, key2)
            cuda_fun = compilecache[key2]
        else
            cuda_fun, _ = cufunction(device(ctx), func, Tuple{$arg_types...})
            compilecache[key2] = cuda_fun
        end

        # call the kernel
        Profile.@launch begin
            cudacall(cuda_fun, dims[1], dims[2], shmem, stream,
                     Tuple{$(real_arg_types...)}, $(real_arg_exprs...))
        end
    end
end
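
#=
Example (illustrative): the first launch of a given kernel compiles it and
populates both caches; later launches with the same argument types, world age
and context reduce to two dictionary lookups plus the `cudacall`.

    @cuda (1, 256) vadd(a, b, c)   # compiles vadd for these types, caches the CuFunction
    @cuda (1, 256) vadd(a, b, c)   # cache hit: no recompilation
=#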
"""
Return the nearest number of threads that is a multiple of the warp size of a device:
nearest_warpsize(dev::CuDevice, threads::Integer)
This is a common requirement, eg. when using shuffle intrinsics.
"""
function nearest_warpsize(dev::CuDevice, threads::Integer)
    ws = CUDAdrv.warpsize(dev)
    return threads + (ws - threads % ws) % ws
end
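
#=
Example: on a device whose warp size is 32 (the typical value for NVIDIA GPUs),
thread counts are rounded up to the next multiple of the warp size:

    dev = CuDevice(0)
    nearest_warpsize(dev, 100)   # 128
    nearest_warpsize(dev, 64)    # 64 (already a multiple)
=#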