This repository has been archived by the owner on May 27, 2021. It is now read-only.
-
-
Notifications
You must be signed in to change notification settings - Fork 55
/
optim.jl
391 lines (321 loc) · 14 KB
/
optim.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
# LLVM IR optimization
"""
    optimize!(job::CompilerJob, mod::LLVM.Module, entry::LLVM.Function)

Optimize the LLVM module `mod`: run Julia-specific passes (GC-frame and PTLS
lowering, plus the regular Julia optimizer), PTX-specific clean-up passes, and
a final round of interprocedural optimizations. If `job.kernel` is set, the
entry point is first promoted to a kernel via `promote_kernel!`. Returns the
(possibly replaced) entry function.
"""
function optimize!(job::CompilerJob, mod::LLVM.Module, entry::LLVM.Function)
tm = machine(job.cap, triple(mod))
# kernels get wrapped and annotated before optimization
if job.kernel
entry = promote_kernel!(job, mod, entry)
end
# common pass-manager set-up, shared by every pipeline below
function initialize!(pm)
add_library_info!(pm, triple(mod))
add_transform_info!(pm, tm)
internalize!(pm, [LLVM.name(entry)])
end
# make the job available to the pass callbacks (lower_gc_frame!, lower_ptls!)
global current_job
current_job = job
# Julia-specific optimizations
#
# NOTE: we need to use multiple distinct pass managers to force pass ordering;
# intrinsics should never get lowered before Julia has optimized them.
if VERSION < v"1.2.0-DEV.375"
# with older versions of Julia, intrinsics are lowered unconditionally so we need to
# replace them with GPU-compatible counterparts before anything else. that breaks
# certain optimizations though: https://github.com/JuliaGPU/CUDAnative.jl/issues/340
ModulePassManager() do pm
initialize!(pm)
add!(pm, FunctionPass("LowerGCFrame", lower_gc_frame!))
aggressive_dce!(pm) # remove dead uses of ptls
add!(pm, ModulePass("LowerPTLS", lower_ptls!))
run!(pm, mod)
end
# regular Julia optimization pipeline, at the user-requested level
ModulePassManager() do pm
initialize!(pm)
ccall(:jl_add_optimization_passes, Cvoid,
(LLVM.API.LLVMPassManagerRef, Cint),
LLVM.ref(pm), Base.JLOptions().opt_level)
run!(pm, mod)
end
else
# newer Julia supports deferring intrinsic lowering, so optimize first
ModulePassManager() do pm
initialize!(pm)
ccall(:jl_add_optimization_passes, Cvoid,
(LLVM.API.LLVMPassManagerRef, Cint, Cint),
LLVM.ref(pm), Base.JLOptions().opt_level, #=lower_intrinsics=# 0)
run!(pm, mod)
end
ModulePassManager() do pm
initialize!(pm)
# lower intrinsics
add!(pm, FunctionPass("LowerGCFrame", lower_gc_frame!))
aggressive_dce!(pm) # remove dead uses of ptls
add!(pm, ModulePass("LowerPTLS", lower_ptls!))
# the Julia GC lowering pass also has some clean-up that is required
if VERSION >= v"1.2.0-DEV.520"
# TODO: move this to LLVM.jl
function LLVMAddLateLowerGCFramePass(PM::LLVM.API.LLVMPassManagerRef)
LLVM.@apicall(:LLVMExtraAddLateLowerGCFramePass,Cvoid,(LLVM.API.LLVMPassManagerRef,), PM)
end
LLVMAddLateLowerGCFramePass(LLVM.ref(pm))
end
run!(pm, mod)
end
end
# PTX-specific optimizations
ModulePassManager() do pm
initialize!(pm)
# NVPTX's target machine info enables runtime unrolling,
# but Julia's pass sequence only invokes the simple unroller.
loop_unroll!(pm)
instruction_combining!(pm) # clean-up redundancy
licm!(pm) # the inner runtime check might be outer loop invariant
# the above loop unroll pass might have unrolled regular, non-runtime nested loops.
# that code still needs to be optimized (arguably, multiple unroll passes should be
# scheduled by the Julia optimizer). do so here, instead of re-optimizing entirely.
early_csemem_ssa!(pm) # TODO: gvn instead? see NVPTXTargetMachine.cpp::addEarlyCSEOrGVNPass
dead_store_elimination!(pm)
constant_merge!(pm)
# NOTE: if an optimization is missing, try scheduling an entirely new optimization
# to see which passes need to be added to the list above
# LLVM.clopts("-print-after-all", "-filter-print-funcs=$(LLVM.name(entry))")
# ModulePassManager() do pm
# add_library_info!(pm, triple(mod))
# add_transform_info!(pm, tm)
# PassManagerBuilder() do pmb
# populate!(pm, pmb)
# end
# run!(pm, mod)
# end
cfgsimplification!(pm)
# get rid of the internalized functions; now possibly unused
global_dce!(pm)
run!(pm, mod)
end
# we compile a module containing the entire call graph,
# so perform some interprocedural optimizations.
#
# for some reason, these passes need to be distinct from the regular optimization chain,
# or certain values (such as the constant arrays used to populate `llvm.compiler.used` as
# part of the LateLowerGCFrame pass) aren't collected properly.
#
# these might not always be safe, as Julia's IR metadata isn't designed for IPO.
ModulePassManager() do pm
dead_arg_elimination!(pm) # parent doesn't use return value --> ret void
run!(pm, mod)
end
return entry
end
## kernel-specific optimizations

# promote a function to a kernel
# FIXME: sig vs tt (code_llvm vs cufunction)
"""
    promote_kernel!(job::CompilerJob, mod::LLVM.Module, entry_f::LLVM.Function)

Promote `entry_f` to a CUDA kernel: wrap it via `wrap_entry!` and attach
`nvvm.annotations` metadata to `mod` (the `kernel` flag, expected CTA size
bounds `reqntid`/`maxntid`, `minctasm`, and `maxnreg`, as configured on the
job). Returns the wrapper function.
"""
function promote_kernel!(job::CompilerJob, mod::LLVM.Module, entry_f::LLVM.Function)
    kernel = wrap_entry!(job, mod, entry_f)

    # property annotations
    # TODO: belongs in irgen? doesn't maxntidx doesn't appear in ptx code?
    annotations = LLVM.Value[kernel]

    ## kernel metadata
    append!(annotations, [MDString("kernel"), ConstantInt(Int32(1), JuliaContext())])

    # helper: emit one annotation per launch dimension, e.g. reqntidx/reqntidy/reqntidz
    function annotate_dims!(prefix, threads)
        bounds = CUDAdrv.CuDim3(threads)
        for dim in (:x, :y, :z)
            bound = getfield(bounds, dim)
            append!(annotations, [MDString("$prefix$dim"),
                                  ConstantInt(Int32(bound), JuliaContext())])
        end
    end

    ## expected CTA sizes
    # NOTE: compare against `nothing` with `!==` (identity), not generic `!=`
    if job.minthreads !== nothing
        annotate_dims!("reqntid", job.minthreads)
    end
    if job.maxthreads !== nothing
        annotate_dims!("maxntid", job.maxthreads)
    end

    if job.blocks_per_sm !== nothing
        append!(annotations, [MDString("minctasm"),
                              ConstantInt(Int32(job.blocks_per_sm), JuliaContext())])
    end
    if job.maxregs !== nothing
        append!(annotations, [MDString("maxnreg"),
                              ConstantInt(Int32(job.maxregs), JuliaContext())])
    end

    push!(metadata(mod), "nvvm.annotations", MDNode(annotations))

    return kernel
end
# determine the wrapper parameter type corresponding to a kernel parameter
function wrapper_type(julia_t::Type, codegen_t::LLVMType)::LLVMType
    # bitstypes that codegen decided to pass through an unrequested pointer:
    # have the wrapper accept the pointed-to value itself instead.
    # (non-bitstypes are opaque jl_value_t structures and keep whatever
    # representation codegen produced, as does anything explicitly a Ptr.)
    if isbitstype(julia_t) && codegen_t isa LLVM.PointerType && !(julia_t <: Ptr)
        return eltype(codegen_t)
    end
    return codegen_t
end
# generate a kernel wrapper to fix & improve argument passing
"""
    wrap_entry!(job::CompilerJob, mod::LLVM.Module, entry_f::LLVM.Function)

Emit a `ptxcall_`-prefixed wrapper function around `entry_f` whose parameters
take by-value arguments where codegen expected pointers (spilling them to a
stack slot), then force-inline the original function into the wrapper.
Returns the wrapper function.
"""
function wrap_entry!(job::CompilerJob, mod::LLVM.Module, entry_f::LLVM.Function)
entry_ft = eltype(llvmtype(entry_f)::LLVM.PointerType)::LLVM.FunctionType
# kernels are expected to return void
@compiler_assert return_type(entry_ft) == LLVM.VoidType(JuliaContext()) job
# filter out ghost types, which don't occur in the LLVM function signatures
sig = Base.signature_type(job.f, job.tt)::Type
julia_types = Type[]
for dt::Type in sig.parameters
if !isghosttype(dt)
push!(julia_types, dt)
end
end
# generate the wrapper function type & definition
wrapper_types = LLVM.LLVMType[wrapper_type(julia_t, codegen_t)
for (julia_t, codegen_t)
in zip(julia_types, parameters(entry_ft))]
wrapper_fn = replace(LLVM.name(entry_f), r"^.+?_"=>"ptxcall_") # change the CC tag
wrapper_ft = LLVM.FunctionType(LLVM.VoidType(JuliaContext()), wrapper_types)
wrapper_f = LLVM.Function(mod, wrapper_fn, wrapper_ft)
# emit IR performing the "conversions"
let builder = Builder(JuliaContext())
entry = BasicBlock(wrapper_f, "entry", JuliaContext())
position!(builder, entry)
wrapper_args = Vector{LLVM.Value}()
# perform argument conversions
codegen_types = parameters(entry_ft)
wrapper_params = parameters(wrapper_f)
param_index = 0
for (julia_t, codegen_t, wrapper_t, wrapper_param) in
zip(julia_types, codegen_types, wrapper_types, wrapper_params)
param_index += 1
if codegen_t != wrapper_t
# the wrapper argument doesn't match the kernel parameter type.
# this only happens when codegen wants to pass a pointer.
@compiler_assert isa(codegen_t, LLVM.PointerType) job
@compiler_assert eltype(codegen_t) == wrapper_t job
# copy the argument value to a stack slot, and reference it.
ptr = alloca!(builder, wrapper_t)
# match the address space the kernel expects the pointer in
if LLVM.addrspace(codegen_t) != 0
ptr = addrspacecast!(builder, ptr, codegen_t)
end
store!(builder, wrapper_param, ptr)
push!(wrapper_args, ptr)
else
# pass-through: keep the value and carry over its parameter attributes
push!(wrapper_args, wrapper_param)
for attr in collect(parameter_attributes(entry_f, param_index))
push!(parameter_attributes(wrapper_f, param_index), attr)
end
end
end
call!(builder, entry_f, wrapper_args)
ret!(builder)
dispose(builder)
end
# early-inline the original entry function into the wrapper
push!(function_attributes(entry_f), EnumAttribute("alwaysinline", 0, JuliaContext()))
linkage!(entry_f, LLVM.API.LLVMInternalLinkage)
# the stack-slot spill above invalidates load metadata; strip it before inlining
fixup_metadata!(entry_f)
ModulePassManager() do pm
always_inliner!(pm)
verifier!(pm)
run!(pm, mod)
end
return wrapper_f
end
# HACK: strip invariant.load and const TBAA metadata on loads from pointer args,
# since storing to a stack slot violates the semantics of those attributes.
# TODO: can we emit a wrapper that doesn't violate Julia's metadata?
function fixup_metadata!(f::LLVM.Function)
    for param in parameters(f)
        llvmtype(param) isa LLVM.PointerType || continue
        # walk every instruction that (transitively) uses the pointer argument
        pending = Vector{LLVM.Instruction}(user.(collect(uses(param))))
        while !isempty(pending)
            inst = popfirst!(pending)
            # drop the offending metadata kinds, if present
            md = metadata(inst)
            for kind in (LLVM.MD_invariant_load, LLVM.MD_tbaa)
                if haskey(md, kind)
                    delete!(md, kind)
                end
            end
            # these instructions derive new pointers from the argument,
            # so keep following their users
            if inst isa LLVM.BitCastInst ||
               inst isa LLVM.GetElementPtrInst ||
               inst isa LLVM.AddrSpaceCastInst
                append!(pending, user.(collect(uses(inst))))
            end
            # IMPORTANT NOTE: if we ever want to inline functions at the LLVM level,
            # we need to recurse into call instructions here, and strip metadata from
            # called functions (see CUDAnative.jl#238).
        end
    end
end
# lower object allocations to PTX malloc
#
# this is a PoC implementation that is very simple: allocate, and never free. it also runs
# _before_ Julia's GC lowering passes, so we don't get to use the results of its analyses.
# when we ever implement a more potent GC, we will need those results, but the relevant pass
# is currently very architecture/CPU specific: hard-coded pool sizes, TLS references, etc.
# such IR is hard to clean-up, so we probably will need to have the GC lowering pass emit
# lower-level intrinsics which then can be lowered to architecture-specific code.
"""
    lower_gc_frame!(fun::LLVM.Function)

Rewrite GC intrinsics in the module containing `fun` to be GPU compatible:
calls to `julia.gc_alloc_obj` are replaced by calls to the `gc_pool_alloc`
runtime function, and calls to `julia.write_barrier` are deleted outright.
Returns `true` if the IR was changed.
"""
function lower_gc_frame!(fun::LLVM.Function)
    job = current_job::CompilerJob
    mod = LLVM.parent(fun)
    changed = false

    # plain alloc: redirect julia.gc_alloc_obj to the GPU pool allocator
    if haskey(functions(mod), "julia.gc_alloc_obj")
        alloc_obj = functions(mod)["julia.gc_alloc_obj"]
        # NOTE: the original code also computed the intrinsic's return type and a
        #       T_pjlvalue, but neither was used; they have been removed here.
        for use in uses(alloc_obj)
            call = user(use)::LLVM.CallInst

            # decode the call: the second operand carries the allocation size
            ops = collect(operands(call))
            sz = ops[2]

            # replace with PTX alloc_obj
            let builder = Builder(JuliaContext())
                position!(builder, call)
                ptr = call!(builder, Runtime.get(:gc_pool_alloc), [sz])
                replace_uses!(call, ptr)
                dispose(builder)
            end

            unsafe_delete!(LLVM.parent(call), call)
            changed = true
        end
        @compiler_assert isempty(uses(alloc_obj)) job
    end

    # we don't care about write barriers
    if haskey(functions(mod), "julia.write_barrier")
        barrier = functions(mod)["julia.write_barrier"]
        for use in uses(barrier)
            call = user(use)::LLVM.CallInst
            unsafe_delete!(LLVM.parent(call), call)
            changed = true
        end
        @compiler_assert isempty(uses(barrier)) job
    end

    return changed
end
# lower the `julia.ptls_states` intrinsic by removing it, since it is GPU incompatible.
#
# this assumes and checks that the TLS is unused, which should be the case for most GPU code
# after lowering the GC intrinsics to TLS-less code and having run DCE.
#
# TODO: maybe don't have Julia emit actual uses of the TLS, but use intrinsics instead,
# making it easier to remove or reimplement that functionality here.
function lower_ptls!(mod::LLVM.Module)
    job = current_job::CompilerJob

    # nothing to do when the module never queries the TLS
    fns = functions(mod)
    haskey(fns, "julia.ptls_states") || return false

    ptls_getter = fns["julia.ptls_states"]
    changed = false
    for use in uses(ptls_getter)
        call = user(use)
        # a live result means someone actually needs the TLS, which we can't provide
        isempty(uses(call)) || error("Thread local storage is not implemented")
        unsafe_delete!(LLVM.parent(call), call)
        changed = true
    end
    @compiler_assert isempty(uses(ptls_getter)) job

    return changed
end