This repository has been archived by the owner on May 27, 2021. It is now read-only.
/
reflection.jl
149 lines (128 loc) · 5.43 KB
/
reflection.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
# code reflection entry-points: the function forms (`code_llvm`, `code_ptx`, `code_sass`)
# and the macro forms of every reflection helper defined in this file
export
@code_lowered, @code_typed, @code_warntype,
code_llvm, code_ptx, code_sass, @code_llvm, @code_ptx, @code_sass
#
# code_* replacements
#
# Determine which device capability to target: the capability of the device owning the
# current context, or v"2.0" when the package is not initialized or no context is active.
function current_capability()
    default_cap = v"2.0"

    # without initialization there is no context to query
    initialized[] || return default_cap

    context = CuCurrentContext()
    return isnull(context) ? default_cap : capability(device(context))
end
"""
    code_llvm([io], f, types; optimize=true, dump_module=false, cap::VersionNumber,
              kernel::Bool=false)

Print the LLVM IR generated for the method of `f` matching the type signature `types` to
`io`, which defaults to `STDOUT`.

# Keywords
- `optimize`: optimize the IR before printing it (defaults to true).
- `dump_module`: print the entire module, including headers and other functions, instead
  of only the entry function (defaults to false).
- `cap`: device capability to generate code for; defaults to the current active device's
  capability, or v"2.0" if there is no such active context.
- `kernel`: passed on to the invocation check and to the entry-point generation (defaults
  to false).
"""
function code_llvm(io::IO, func::ANY, types::ANY=Tuple;
                   optimize::Bool=true, dump_module::Bool=false,
                   cap::VersionNumber=current_capability(), kernel::Bool=false)
    sig = Base.to_tuple_type(types)
    check_invocation(func, sig; kernel=kernel)

    llvm_mod = irgen(func, sig)
    entry_fn = add_entry!(llvm_mod, func, sig; kernel=kernel)
    optimize && optimize!(llvm_mod, entry_fn, cap)

    # either the whole module or just the entry function gets printed
    show(io, dump_module ? llvm_mod : entry_fn)
end

# Convenience method: same as above, printing to `STDOUT`.
function code_llvm(func::ANY, types::ANY=Tuple; kwargs...)
    return code_llvm(STDOUT, func, types; kwargs...)
end
"""
    code_ptx([io], f, types; cap::VersionNumber, kernel::Bool=false)

Print the PTX assembly generated for the method of `f` matching the type signature `types`
to `io`, which defaults to `STDOUT`.

# Keywords
- `cap`: device capability to generate code for; defaults to the current active device's
  capability, or v"2.0" if there is no such active context.
- `kernel`: whether the function in question is an entry-point function, or a regular
  device function (defaults to false).
"""
function code_ptx(io::IO, func::ANY, types::ANY=Tuple;
                  cap::VersionNumber=current_capability(), kernel::Bool=false)
    @assert isa(func, Core.Function)

    sig = Base.to_tuple_type(types)
    check_invocation(func, sig; kernel=kernel)

    # only the first element of the compilation result (the assembly) is of interest here
    asm, _ = compile_function(func, sig, cap; kernel=kernel)

    # TODO: this code contains all the functions in the call chain,
    #       is it possible to implement `dump_module`?
    print(io, asm)
end

# Convenience method: same as above, printing to `STDOUT`.
function code_ptx(func::ANY, types::ANY=Tuple; kwargs...)
    return code_ptx(STDOUT, func, types; kwargs...)
end
"""
    code_sass([io], f, types; cap::VersionNumber)

Print the SASS code generated for the method of `f` matching the type signature `types` to
`io`, which defaults to `STDOUT`.

The PTX code for the method is assembled with `ptxas` into a temporary file, which is then
disassembled with `cuobjdump`. The temporary file is removed afterwards, also when either
of the tools fails.

# Keywords
- `cap`: device capability to generate code for; defaults to the current active device's
  capability, or v"2.0" if there is no such active context.

Note that the method needs to be a valid entry-point kernel, ie. it should not return any
values.
"""
function code_sass(io::IO, func::ANY, types::ANY=Tuple;
                   cap::VersionNumber=current_capability())
    @assert isa(func, Core.Function)
    tt = Base.to_tuple_type(types)
    check_invocation(func, tt; kernel=true)

    ptx,_ = compile_function(func, tt, cap)

    fn = tempname()
    gpu = "sm_$(cap.major)$(cap.minor)"
    try
        # NOTE: this might not match what is being executed, due to the PTX->SASS conversion
        #       by the driver possibly not matching what `ptxas` (part of the toolkit) does.
        # TODO: see how `nvvp` extracts SASS code when doing PC sampling, and copy that.
        Base.run(`$ptxas --gpu-name $gpu --output-file $fn --input-as-string $ptx`)
        print(io, readstring(`$cuobjdump --dump-sass $fn`))
    finally
        # `ptxas` runs inside the `try` so that a failing assembler does not leave a
        # partial output file behind; `force=true` because in that case the file might
        # not exist at all, and a failing `rm` would mask the original error.
        rm(fn; force=true)
    end
end

# Convenience method: same as above, printing to `STDOUT`.
code_sass(func::ANY, types::ANY=Tuple; kwargs...) = code_sass(STDOUT, func, types; kwargs...)
#
# @code_* replacements
#
# Build the expression `f(callee, argtypes)` from the call expression `ex`: the callee and
# the arguments of `ex` are escaped, and the argument types are computed with
# `Base.typesof` after broadcasting `cudaconvert` over the argument tuple.
function gen_call_with_extracted_types(f, ex)
    callee = esc(ex.args[1])
    call_args = map(esc, ex.args[2:end])
    return :($f($callee, Base.typesof(cudaconvert.(($(call_args...),))...)))
end
# Define one `@code_*` macro per reflection function. `kernel_arg` marks the functions
# that the generated macro calls with an additional `kernel` keyword argument.
for (fname,kernel_arg) in [(:code_lowered, false), (:code_typed, false), (:code_warntype, false),
(:code_llvm, true), (:code_ptx, true), (:code_sass, false)]
# TODO: test the kernel_arg-based behavior
@eval begin
@doc $"""
$fname
Extracts the relevant function call from any `@cuda` invocation, evaluates the
arguments to the function or macro call, determines their types (taking into account
GPU-specific type conversions), and calls $fname on the resulting expression.
Can be applied to a pure function call, or a call prefixed with the `@cuda` macro.
In that case, kernel code generation conventions are used (wrt. argument conversions,
return values, etc).
""" macro $(fname)(ex0)
# A macro call as argument is treated as an `@cuda` invocation: unwrap the function call
# it contains and remember that kernel conventions apply.
if ex0.head == :macrocall
# @cuda (...) f()
# NOTE(review): the call sits at a different argument index depending on the Julia
# version -- presumably newer versions store an extra entry in macrocall expressions;
# confirm against the Julia changelog.
if Base.VERSION >= v"0.7.0-DEV.357"
ex0 = ex0.args[4]
else
ex0 = ex0.args[3]
end
kernel = true
else
kernel = false
end
# Only forward `kernel` to the functions flagged with `kernel_arg`; for the other
# functions the flag computed above is not used.
wrapper(func, types) = $kernel_arg ? $fname(func, types, kernel = kernel) :
$fname(func, types)
# Build the call to `wrapper` with the escaped callee and the types of the arguments.
gen_call_with_extracted_types(wrapper, ex0)
end
end
end