/
context.jl
413 lines (322 loc) · 11 KB
/
context.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
# Context management
export
CuPrimaryContext, CuContext, current_context, has_context, activate,
unsafe_reset!, isactive, flags, setflags!,
device, device_synchronize
## construction and destruction
@enum_without_prefix CUctx_flags CU_
"""
CuContext(dev::CuDevice, flags=CTX_SCHED_AUTO)
CuContext(f::Function, ...)
Create a CUDA context for device. A context on the GPU is analogous to a process on the CPU,
with its own distinct address space and allocated resources. When a context is destroyed,
the system cleans up the resources allocated to it.
When you are done using the context, call [`CUDA.unsafe_destroy!`](@ref) to mark it for
deletion, or use do-block syntax with this constructor.
"""
mutable struct CuContext
    # raw driver handle for this context
    handle::CUcontext
    # validity bit; cleared by `invalidate!` once the context has been destroyed
    valid::Bool

    # Create a new context on `dev` and return the unique object for its handle.
    function CuContext(dev::CuDevice, flags=0)
        handle_ref = Ref{CUcontext}()
        cuCtxCreate_v2(handle_ref, flags, dev)
        UniqueCuContext(handle_ref[])
    end

    # Look up the driver's current context; throws `UndefRefError` when the
    # driver reports a NULL handle (i.e. no context is current).
    global function current_context()
        handle_ref = Ref{CUcontext}()
        cuCtxGetCurrent(handle_ref)
        handle_ref[] == C_NULL && throw(UndefRefError())
        UniqueCuContext(handle_ref[])
    end

    # Raw constructor: wraps `handle` in a fresh, valid object without looking
    # for an existing object for that handle. Declared `global` so that code
    # outside the struct (such as `UniqueCuContext`) can reach `new`.
    global UnsafeCuContext(handle::CUcontext) = new(handle, true)

    # NOTE: a stray bare `unsafe` token used to follow here; inside a struct body
    # it declared an extra, never-initialised field of type `Any`. It was removed.
end
"""
current_context()
Returns the current context, throwing an `UndefRefError` if no context is current.
!!! warning
This is a low-level API, returning the current context as known to the CUDA driver.
For most users, it is recommended to use the [`context`](@ref) method instead.
"""
current_context()
"""
has_context()
Returns whether there is an active context.
"""
function has_context()
    # ask the driver for the current context handle; a NULL handle means none
    current = Ref{CUcontext}()
    cuCtxGetCurrent(current)
    return !(current[] == C_NULL)
end
# we need to know when a context has been destroyed, to make sure we don't destroy resources
# after the owning context has been destroyed already. this is complicated by the fact that
# contexts obtained from a primary context have the same handle before and after primary
# context destruction, so we cannot use a simple mapping from context handle to a validity
# bit. instead, we unique the context objects and put a validity bit in there.
function isvalid(ctx::CuContext)
    # the validity bit lives on the (unique) context object itself
    return ctx.valid
end
function invalidate!(ctx::CuContext)
    # clear the validity bit; the handle itself is left untouched
    setfield!(ctx, :valid, false)
    return nothing
end
# to make this work, every function returning a context (e.g. `cuCtxGetCurrent`, attribute
# functions, etc.) needs to return the same context object. because looking up a context is a
# very common operation (often executed from finalizers), we need to ensure this look-up is
# fast and does not switch tasks. we do this by scanning a simple linear vector.
# capacity of the `context_objects` table
const MAX_CONTEXTS = 1024
# table of context objects; slots start out unassigned and are filled by `UniqueCuContext`
const context_objects = Vector{CuContext}(undef, MAX_CONTEXTS)
# lock held by `UniqueCuContext` while it scans and updates `context_objects`
const context_lock = Base.ThreadSynchronizer()
function UniqueCuContext(handle::CUcontext)
    # Return the unique `CuContext` object for `handle`. An existing valid object
    # for the handle is returned as-is; otherwise a new object is created and
    # stored in `context_objects`. Raises an error when the table is full.
    @lock context_lock begin
        # look if there's an existing object for this handle
        i = 1
        @inbounds while i <= MAX_CONTEXTS && isassigned(context_objects, i)
            if context_objects[i].handle == handle
                if isvalid(context_objects[i])
                    return context_objects[i]
                else
                    # this object was invalidated, so we can reuse its slot
                    break
                end
            end
            i += 1
        end

        # The scan stops at the first reusable or unassigned slot, so `i` only
        # runs past the end when every slot is taken. Testing `i == MAX_CONTEXTS`
        # here would reject the last free slot and would miss the full-table case
        # (`i == MAX_CONTEXTS + 1`), letting the `@inbounds` store below write
        # out of bounds.
        if i > MAX_CONTEXTS
            error("Exceeded maximum amount of CUDA contexts. This is unexpected; please file an issue.")
        end

        # we've got a slot we can write to
        new_object = UnsafeCuContext(handle)
        @inbounds context_objects[i] = new_object
        return new_object
    end
end
"""
unsafe_destroy!(ctx::CuContext)
Immediately destroy a context, freeing up all resources associated with it. This does not
respect any users of the context, and might make other objects unusable.
"""
function unsafe_destroy!(ctx::CuContext)
    # nothing to do for a context that was already invalidated
    isvalid(ctx) || return nothing

    # destroy through the driver first, then record that the object is dead
    cuCtxDestroy_v2(ctx)
    return invalidate!(ctx)
end
function Base.unsafe_convert(::Type{CUcontext}, ctx::CuContext)
    # hand the raw driver handle to `ccall`-style conversions
    return ctx.handle
end
# NOTE: we don't implement `isequal` or `hash` in order to fall back to `===` and `objectid`
# as contexts are unique, and with primary device contexts identical handles might be
# returned after resetting the context (device) and all associated resources.
function Base.show(io::IO, ctx::CuContext)
    # a NULL handle gets a fixed, short representation
    if ctx.handle == C_NULL
        return print(io, "CuContext(NULL)")
    end

    # otherwise show the handle, the object identity and (if so) the invalidated state
    parts = [@sprintf("%p", ctx.handle), @sprintf("instance %x", objectid(ctx))]
    isvalid(ctx) || push!(parts, "invalidated")
    return print(io, "CuContext(", join(parts, ", "), ")")
end
## core context API
"""
push!(CuContext, ctx::CuContext)
Pushes a context on the current CPU thread.
"""
function Base.push!(::Type{CuContext}, ctx::CuContext)
    # thin wrapper over the driver's push call
    return cuCtxPushCurrent_v2(ctx)
end
"""
pop!(CuContext)
Pops the current CUDA context from the current CPU thread.
"""
function Base.pop!(::Type{CuContext})
    popped = Ref{CUcontext}()
    # the popped handle is intentionally not wrapped and returned: callers may
    # not need it, and constructing the unique object is expensive
    return cuCtxPopCurrent_v2(popped)
end
"""
activate(ctx::CuContext)
Binds the specified CUDA context to the calling CPU thread.
"""
function activate(ctx::CuContext)
    # thin wrapper over the driver's set-current call
    return cuCtxSetCurrent(ctx)
end
function CuContext(f::Function, dev::CuDevice, args...)
    # do-block form: create a context, run `f` on it, then always pop and destroy it
    scoped = CuContext(dev, args...)    # implicitly pushes
    try
        return f(scoped)
    finally
        pop!(CuContext)
        unsafe_destroy!(scoped)
    end
end
## primary context management
"""
CuPrimaryContext(dev::CuDevice)
Create a primary CUDA context for a given device.
Each primary context is unique per device and is shared with CUDA runtime API. It is meant
for interoperability with (applications using) the runtime API.
"""
struct CuPrimaryContext
# device whose primary context this value identifies
dev::CuDevice
end
# we need to keep track of contexts derived from primary contexts,
# so that we can invalidate them when the primary context is reset.
# maps a primary context to the context object most recently derived from it
# (each `CuContext(pctx)` call overwrites the entry for `pctx`)
const derived_contexts = Dict{CuPrimaryContext,CuContext}()
# lock taken around every read and write of `derived_contexts`
const derived_lock = ReentrantLock()
"""
CuContext(pctx::CuPrimaryContext)
Derive a context from a primary context.
Calling this function increases the reference count of the primary context. The returned
context *should not* be freed with the `unsafe_destroy!` function that's used with ordinary
contexts. Instead, the refcount of the primary context should be decreased by calling
`unsafe_release!`, or set to zero by calling `unsafe_reset!`. The easiest way to do this is
by using the `do`-block syntax.
"""
function CuContext(pctx::CuPrimaryContext)
    # retain the device's primary context and fetch the unique object for its handle
    retained = Ref{CUcontext}()
    cuDevicePrimaryCtxRetain(retained, pctx.dev)
    derived = UniqueCuContext(retained[])

    # remember the derived context so it can be invalidated on release/reset
    Base.@lock derived_lock begin
        derived_contexts[pctx] = derived
    end

    return derived
end
function CuContext(f::Function, pctx::CuPrimaryContext)
    # do-block form: derive a context, run `f` on it, then always release the
    # reference taken on the primary context
    derived = CuContext(pctx)
    try
        return f(derived)
    finally
        unsafe_release!(pctx)
    end
end
"""
CUDA.unsafe_release!(pctx::CuPrimaryContext)
Lower the refcount of a context, possibly freeing up all resources associated with it. This
does not respect any users of the context, and might make other objects unusable.
"""
function unsafe_release!(pctx::CuPrimaryContext)
    # Release one reference on the primary context of `pctx.dev`.
    # The device must be read from `pctx`: there is no `dev` binding in this
    # function, so referring to a bare `dev` fails with an `UndefVarError`.
    if driver_version() >= v"11"
        cuDevicePrimaryCtxRelease_v2(pctx.dev)
    else
        cuDevicePrimaryCtxRelease(pctx.dev)
    end

    # if this releases the last reference, invalidate all derived contexts
    if !isactive(pctx)
        ctx = @lock derived_lock get(derived_contexts, pctx, nothing)
        if ctx !== nothing
            invalidate!(ctx)
        end
    end

    return
end
"""
unsafe_reset!(pctx::CuPrimaryContext)
Explicitly destroys and cleans up all resources associated with a device's primary context
in the current process. Note that this forcibly invalidates all contexts derived from this
primary context, and as a result outstanding resources might become invalid.
"""
function unsafe_reset!(pctx::CuPrimaryContext)
    # pick the driver entry point that matches the driver version
    driver_version() >= v"11" ? cuDevicePrimaryCtxReset_v2(pctx.dev) :
                                cuDevicePrimaryCtxReset(pctx.dev)

    # invalidate the context derived from this primary context, if one was recorded
    derived = @lock derived_lock get(derived_contexts, pctx, nothing)
    derived === nothing || invalidate!(derived)

    return
end
function state(pctx::CuPrimaryContext)
    # query the driver once; returns a `(flags, active)` tuple
    flags_ref = Ref{Cuint}()
    active_ref = Ref{Cint}()
    cuDevicePrimaryCtxGetState(pctx.dev, flags_ref, active_ref)

    is_active = active_ref[] == one(Cint)
    return (flags_ref[], is_active)
end
"""
isactive(pctx::CuPrimaryContext)
Query whether a primary context is active.
"""
function isactive(pctx::CuPrimaryContext)
    # second component of the `(flags, active)` state tuple
    _, active = state(pctx)
    return active
end
"""
flags(pctx::CuPrimaryContext)
Query the flags of a primary context.
"""
function flags(pctx::CuPrimaryContext)
    # first component of the `(flags, active)` state tuple
    return first(state(pctx))
end
"""
setflags!(pctx::CuPrimaryContext, flags)
Set the flags of a primary context.
"""
function setflags!(pctx::CuPrimaryContext, flags)
    # pick the driver entry point that matches the driver version
    return driver_version() >= v"11" ?
           cuDevicePrimaryCtxSetFlags_v2(pctx.dev, flags) :
           cuDevicePrimaryCtxSetFlags(pctx.dev, flags)
end
## context properties
"""
device(::CuContext)
Returns the device for a context.
"""
function device(ctx::CuContext)
    # Temporarily make `ctx` current, query the current device, and restore the
    # previous context. The `finally` block guarantees the pop happens even when
    # `current_device()` throws, so a failed query does not leave `ctx` pushed;
    # this matches how `synchronize(ctx)` handles its push/pop pair.
    push!(CuContext, ctx)
    try
        return current_device()
    finally
        pop!(CuContext)
    end
end
"""
synchronize(ctx::CuContext)
Block until all operations on `ctx` have completed. This is a heavyweight operation,
typically you only need to call [`synchronize`](@ref) which only synchronizes the stream
associated with the current task.
"""
function synchronize(ctx::CuContext)
    # make `ctx` current for the duration of the device-wide synchronization,
    # and always pop it again afterwards
    push!(CuContext, ctx)
    try
        return device_synchronize()
    finally
        pop!(CuContext)
    end
end
# same, but without the context switch
"""
device_synchronize()
Block until all operations on the current context have completed. This is a heavyweight operation,
typically you only need to call [`synchronize`](@ref) which only synchronizes the stream
associated with the current task.
On the device, `device_synchronize` acts as a synchronization point for child grids in the
context of dynamic parallelism.
"""
device_synchronize()
# XXX: can we put the device docstring in dynamic_parallelism.jl?
## cache config
export cache_config, cache_config!
@enum_without_prefix CUfunc_cache CU_
function cache_config()
    # read the cache configuration via the driver
    current = Ref{CUfunc_cache}()
    cuCtxGetCacheConfig(current)
    return current[]
end
# set the cache configuration via the driver
cache_config!(config::CUfunc_cache) = cuCtxSetCacheConfig(config)
## shared memory config
export shmem_config, shmem_config!
@enum_without_prefix CUsharedconfig CU_
function shmem_config()
    # read the shared memory configuration via the driver
    current = Ref{CUsharedconfig}()
    cuCtxGetSharedMemConfig(current)
    return current[]
end
# set the shared memory configuration via the driver
shmem_config!(config::CUsharedconfig) = cuCtxSetSharedMemConfig(config)
## limits
export limit, limit!
@enum_without_prefix CUlimit CU_
function limit(lim::CUlimit)
    # the driver reports the limit as a `Csize_t`; hand it back as an `Int`
    raw = Ref{Csize_t}()
    cuCtxGetLimit(raw, lim)
    return Int(raw[])
end
function limit!(lim::CUlimit, val)
    # set limit `lim` to `val` via the driver
    return cuCtxSetLimit(lim, val)
end
## p2p
export enable_peer_access, disable_peer_access
function enable_peer_access(peer::CuContext, flags=0)
    # thin wrapper over the driver's enable-peer-access call for `peer`
    return cuCtxEnablePeerAccess(peer, flags)
end
function disable_peer_access(peer::CuContext)
    # thin wrapper over the driver's disable-peer-access call for `peer`
    return cuCtxDisablePeerAccess(peer)
end