This repository has been archived by the owner on Mar 12, 2021. It is now read-only.
-
-
Notifications
You must be signed in to change notification settings - Fork 83
/
memory.jl
408 lines (347 loc) · 11.6 KB
/
memory.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
import Base.GC: gc
# dynamic memory pool allocator
#
# this allocator sits between CuArray constructors
# and the actual memory allocation in CUDAdrv.Mem
#
# the core design is pretty simple:
# - bin allocations into multiple pools according to their size (see `poolidx`)
# - when memory is requested, check the pool for unused memory, or allocate dynamically
# - conversely, when memory is released, put it in the appropriate pool for future use
#
# to avoid memory hogging and/or thrashing the Julia GC:
# - keep track of used and available memory, in order to determine the usage of each pool
# - keep track of each pool's usage, as well as a window of previous usages
# - regularly release memory from underused pools (see `reclaim(false)`)
#
# possible improvements:
# - pressure: have the `reclaim` background task reclaim more aggressively,
# and call it from the failure cascade in `alloc`
# - context management: either switch contexts when performing memory operations,
# or just use unified memory for all allocations.
# - per-device pools
# lock taken by the pool functions in this file before they touch the shared
# bookkeeping below
const pool_lock = ReentrantLock()

## infrastructure

# per-pool collections, indexed by pool id (see `poolidx`):
# - buffers that have been handed out by `alloc` and not yet returned
# - buffers that were returned by `dealloc` and can be reused
const pools_used = Set{Mem.Buffer}[]
const pools_avail = Vector{Mem.Buffer}[]
# map a size `n` onto the index of the pool that serves it: pool `idx` holds
# buffers of `2^(idx-1)` (see `poolsize`), so `n` is rounded up to the next power of two
function poolidx(n)
    exponent = ceil(Int, log2(n))
    return exponent + 1
end
# size of the buffers kept in pool `idx`: pool 1 holds size 1, and every following pool
# doubles the size of the previous one
function poolsize(idx)
    exponent = idx - 1
    return 2^exponent
end
# make sure that pools `1` through `idx` exist, appending a fresh set of bookkeeping
# entries (usage, usage history, used set, available list) for every missing pool
function create_pools(idx)
    # fast-path without taking a lock
    length(pool_usage) >= idx && return

    lock(pool_lock)
    try
        # the length is re-checked on every iteration while holding the lock
        while length(pool_usage) < idx
            push!(pools_avail, Vector{Mem.Buffer}())
            push!(pools_used, Set{Mem.Buffer}())
            push!(pool_history, initial_usage)
            push!(pool_usage, 1)
        end
    finally
        unlock(pool_lock)
    end
    return
end
## management

# number of past usage samples that are remembered for every pool
const USAGE_WINDOW = 5

# usage history a pool starts out with, and is reset to by `scan` when it holds no buffers
const initial_usage = ntuple(_ -> 1, USAGE_WINDOW)

# per-pool usage (fraction of buffers in use), and the window of previously recorded usages
const pool_usage = Float64[]
const pool_history = NTuple{USAGE_WINDOW,Float64}[]

# min and max time between successive background task iterations (passed to `sleep`).
# when the pool usages don't change, scan less regularly.
#
# together with USAGE_WINDOW, this determines how long it takes for objects to get reclaimed
const MIN_DELAY = 1.0
const MAX_DELAY = 5.0
# debug stats

# counters and timers describing how the pool has been used
mutable struct PoolStats
    # allocation requests (number of calls to `alloc` / `dealloc`)
    req_nalloc::Int
    req_nfree::Int
    ## in bytes
    req_alloc::Int
    user_free::Int

    # actual allocations (buffers really allocated / freed through `Mem`)
    actual_nalloc::Int
    actual_nfree::Int
    ## in bytes
    actual_alloc::Int
    actual_free::Int

    # accumulated timings: time spent in `Mem` calls, and in the pool functions overall
    cuda_time::Float64
    total_time::Float64

    # internal stats: how often each numbered step in `alloc` produced the buffer
    alloc_1::Int
    alloc_2::Int
    alloc_3::Int
    alloc_4::Int
end

# the global statistics object; every counter and timer starts at zero
const stats = PoolStats(ntuple(_ -> 0, fieldcount(PoolStats))...)
# snapshot a statistics object: build a new `PoolStats` from the current value of every field
function Base.copy(stats::PoolStats)
    values = ntuple(i -> getfield(stats, i), fieldcount(PoolStats))
    return PoolStats(values...)
end
# allocation traces
#
# when tracing is enabled, `alloc` records the requested size and the stack trace of the
# request for every buffer it hands out, and `dealloc` removes that entry again
const pool_traces = Dict{Mem.Buffer, Tuple{Int, Base.StackTraces.StackTrace}}()
# tracing is controlled by the CUARRAYS_TRACE_POOL environment variable, which is parsed
# as a `Bool` and treated as "false" when unset
const tracing = parse(Bool, get(ENV, "CUARRAYS_TRACE_POOL", "false"))
# initialize the memory pool
#
# - pre-creates the first 30 pools
# - unless CUARRAYS_MANAGED_POOL is set to a false value, starts a background task that
#   alternates between `scan` and `reclaim`, sleeping between MIN_DELAY and MAX_DELAY
# - when CUARRAYS_MANAGED_POOL is set at all, registers an exit hook printing the statistics
function __init_memory__()
    create_pools(30) # up to 512 MiB

    managed = parse(Bool, get(ENV, "CUARRAYS_MANAGED_POOL", "true"))
    if managed
        @async begin
            # kept local to the task, so it is not a reassigned captured variable
            delay = MIN_DELAY
            while true
                if scan()
                    # pools are active: scan at the highest rate again
                    delay = MIN_DELAY
                else
                    # nothing changed: back off, up to the maximum delay
                    delay = min(delay*2, MAX_DELAY)
                end

                reclaim()

                sleep(delay)
            end
        end
    end

    # explicitly setting the environment variable (to any value) enables the report
    verbose = haskey(ENV, "CUARRAYS_MANAGED_POOL")
    if verbose
        atexit(()->begin
            # the byte counts are `req_alloc` / `user_free`; `PoolStats` has no `req_free`
            Core.println("""
                Pool statistics (managed: $(managed ? "yes" : "no")):
                - requested alloc/free: $(stats.req_nalloc)/$(stats.req_nfree) ($(Base.format_bytes(stats.req_alloc))/$(Base.format_bytes(stats.user_free)))
                - actual alloc/free: $(stats.actual_nalloc)/$(stats.actual_nfree) ($(Base.format_bytes(stats.actual_alloc))/$(Base.format_bytes(stats.actual_free)))
                - alloc types: $(stats.alloc_1) $(stats.alloc_2) $(stats.alloc_3) $(stats.alloc_4)""")
        end)
    end
end
# scan every pool and manage the usage history
#
# for every pool that holds buffers, the usage recorded since the previous scan is pushed
# onto the history window (unless the whole window already equals the current usage) and
# the recorded usage is reset to the current one. pools without buffers are reset to
# their initial state.
#
# returns a boolean indicating whether any pool is active (this can be a false negative)
function scan()
    gc(false) # quick, incremental collection

    lock(pool_lock) do
        any_active = false

        @inbounds for pid in 1:length(pool_history)
            in_use = length(pools_used[pid])
            idle = length(pools_avail[pid])
            total = in_use + idle

            if total == 0
                # nothing lives in this pool: forget what we knew about it
                pool_usage[pid] = 1
                pool_history[pid] = initial_usage
                continue
            end

            window = pool_history[pid]
            recorded = pool_usage[pid]
            current = in_use / total

            if !all(past -> past == current, window)
                # shift the history window with the recorded usage
                pool_history[pid] = (Base.tail(window)..., recorded)

                # reset the usage with the current one
                pool_usage[pid] = current
            end

            # a pool counts as active when its recorded usage differs from the current one
            any_active |= (recorded != current)
        end

        any_active
    end
end
# reclaim free objects
#
# with `full=true`, every buffer that is currently available in any pool is freed.
# otherwise, each pool only frees as many available buffers as its highest recent usage
# (history window plus current usage) leaves unused.
function reclaim(full::Bool=false)
    lock(pool_lock) do
        stats.total_time += Base.@elapsed begin
            if !full
                # only reclaim really unused buffers
                @inbounds for pid in 1:length(pool_usage)
                    in_use = length(pools_used[pid])
                    idle = length(pools_avail[pid])
                    idle > 0 || continue

                    # reclaim as much as the usage allows
                    peak_usage = maximum((pool_history[pid]..., pool_usage[pid]))
                    surplus = floor(Int, (1 - peak_usage) * (in_use + idle))
                    @assert surplus <= idle

                    for _ in 1:surplus
                        buf = pop!(pools_avail[pid])
                        stats.actual_nfree += 1
                        stats.cuda_time += Base.@elapsed Mem.free(buf)
                        stats.actual_free += poolsize(pid)
                    end
                end
            else
                # reclaim all currently unused buffers
                for (pid, idle_bufs) in enumerate(pools_avail)
                    for buf in idle_bufs
                        stats.actual_nfree += 1
                        stats.cuda_time += Base.@elapsed Mem.free(buf)
                        stats.actual_free += poolsize(pid)
                    end
                    empty!(idle_bufs)
                end
            end
        end
    end
end
# largest request (in bytes) that is served from the pools; `alloc` and `dealloc` go
# straight to `Mem` for anything bigger
const MAX_POOL = 100 * 1024 * 1024 # 100 MiB
## interface
# allocate a buffer that can hold `bytes` bytes, and return it
#
# requests of 0 bytes, and requests larger than MAX_POOL, bypass the pools. all other
# requests are rounded up to the size of their pool (see `poolidx` / `poolsize`) and go
# through the following cascade:
# 1. reuse an available buffer from the pool
# 2. allocate a new buffer
# 3. on out-of-memory: run a full Julia GC and look in the pool again
# 4. reclaim all unused buffers from every pool, and allocate again
#
# errors other than `CUDAdrv.ERROR_OUT_OF_MEMORY` are rethrown immediately; the
# out-of-memory error is rethrown when step 4 fails as well (after logging the
# outstanding allocations if tracing is enabled).
function alloc(bytes)
    # 0-byte allocations shouldn't hit the pool
    bytes == 0 && return Mem.alloc(0)

    buf = Ref{Mem.Buffer}()
    stats.req_nalloc += 1
    stats.req_alloc += bytes
    stats.total_time += Base.@elapsed begin
        # do we even consider pooling?
        pooling = bytes <= MAX_POOL

        if pooling
            pid = poolidx(bytes)
            create_pools(pid)
            alloc_bytes = poolsize(pid)

            @inbounds used = pools_used[pid]
            @inbounds avail = pools_avail[pid]
        else
            alloc_bytes = bytes
        end

        lock(pool_lock) do
            # 1. find an unused buffer in our pool
            if pooling && !isempty(avail)
                stats.alloc_1 += 1
                buf[] = pop!(avail)
            else
                try
                    # 2. didn't have one, so allocate a new buffer
                    stats.cuda_time += Base.@elapsed begin
                        buf[] = Mem.alloc(alloc_bytes)
                    end
                    stats.alloc_2 += 1
                    stats.actual_nalloc += 1
                    stats.actual_alloc += alloc_bytes
                catch ex
                    ex == CUDAdrv.ERROR_OUT_OF_MEMORY || rethrow()
                    # 3. that failed; make Julia collect objects and check 1. again
                    gc(true) # full collection
                    if pooling && !isempty(avail)
                        stats.alloc_3 += 1
                        buf[] = pop!(avail)
                    else
                        # 4. didn't have one, so reclaim all other unused buffers and do 2. again
                        reclaim(true)
                        try
                            stats.cuda_time += Base.@elapsed begin
                                buf[] = Mem.alloc(alloc_bytes)
                            end
                            stats.alloc_4 += 1
                            stats.actual_nalloc += 1
                            stats.actual_alloc += alloc_bytes
                        catch ex
                            ex == CUDAdrv.ERROR_OUT_OF_MEMORY || rethrow()
                            if tracing
                                @error "Failed to allocate $(Base.format_bytes(bytes)) (requires $(Base.format_bytes(alloc_bytes)) buffer)"
                                # use names distinct from `bytes` and `buf`: assigning to
                                # `bytes` in here would overwrite the function argument,
                                # and force it to be boxed because this closure captures it
                                for (traced_buf, (traced_bytes, bt)) in pool_traces
                                    @warn "Outstanding allocation of $(Base.format_bytes(traced_bytes)) (requires $(Base.format_bytes(traced_buf.bytesize)) buffer)" exception=(ex,bt)
                                end
                            end
                            rethrow()
                        end
                    end
                end
            end

            if pooling
                # mark the buffer as used
                push!(used, buf[])

                # update pool usage
                current_usage = length(used) / (length(avail) + length(used))
                pool_usage[pid] = max(pool_usage[pid], current_usage)
            end
        end
    end

    if tracing
        # remember who asked for this buffer, to report it when running out of memory
        pool_traces[buf[]] = (bytes, stacktrace())
    end

    buf[]
end
# release a buffer that was obtained from `alloc(bytes)`
#
# buffers of pooled sizes are not freed, but moved from the pool's used set to its list
# of available buffers; larger buffers are freed immediately. returns nothing.
function dealloc(buf, bytes)
    # 0-byte allocations shouldn't hit the pool, and releasing one must not allocate.
    # NOTE(review): this assumes the buffer returned by `Mem.alloc(0)` does not need an
    # explicit `Mem.free` -- confirm against CUDAdrv.
    bytes == 0 && return

    stats.req_nfree += 1
    stats.user_free += bytes

    stats.total_time += Base.@elapsed begin
        # was this a pooled buffer?
        pooling = bytes <= MAX_POOL

        if pooling
            pid = poolidx(bytes)
            # the pool must already exist, as `alloc` created it when handing out `buf`
            @assert pid <= length(pools_used)

            @inbounds used = pools_used[pid]
            @inbounds avail = pools_avail[pid]

            lock(pool_lock) do
                # mark the buffer as available
                delete!(used, buf)
                push!(avail, buf)

                # update pool usage
                current_usage = length(used) / (length(used) + length(avail))
                pool_usage[pid] = max(pool_usage[pid], current_usage)
            end
        else
            Mem.free(buf)
        end
    end

    if tracing
        delete!(pool_traces, buf)
    end

    return
end
## utility macros
using Printf
# evaluate `ex`, discard its value, and return the number of bytes that were requested
# from the pool in the meantime (the increase of `stats.req_alloc`)
macro allocated(ex)
    quote
        let
            # the expression is wrapped in a function that is called right away
            local measure
            function measure()
                requested_before = stats.req_alloc
                $(esc(ex))
                return stats.req_alloc - requested_before
            end
            measure()
        end
    end
end
# evaluate `ex` and return its value, after printing a line with the elapsed time and,
# for both the CPU and the GPU, the number and size of allocations and the share of time
# spent on memory management.
#
# for the GPU, the "gc time" is the time spent in the pool functions
# (`stats.total_time`), of which the time spent in `Mem` calls (`stats.cuda_time`) is
# reported as "spent allocating".
macro time(ex)
    quote
        # snapshot all counters before and after evaluating the expression
        local gpu_mem_stats0 = copy(stats)
        local cpu_mem_stats0 = Base.gc_num()
        local cpu_time0 = time_ns()

        local val = $(esc(ex))

        local cpu_time1 = time_ns()
        local cpu_mem_stats1 = Base.gc_num()
        local gpu_mem_stats1 = copy(stats)

        local cpu_time = (cpu_time1 - cpu_time0) / 1e9
        # total pool time, as opposed to the `Mem`-only time below; using `cuda_time` for
        # both would always report 100% of the time as spent allocating
        local gpu_gc_time = gpu_mem_stats1.total_time - gpu_mem_stats0.total_time
        local gpu_lib_time = gpu_mem_stats1.cuda_time - gpu_mem_stats0.cuda_time
        local gpu_alloc_count = gpu_mem_stats1.req_nalloc - gpu_mem_stats0.req_nalloc
        local gpu_alloc_size = gpu_mem_stats1.req_alloc - gpu_mem_stats0.req_alloc
        local cpu_mem_stats = Base.GC_Diff(cpu_mem_stats1, cpu_mem_stats0)
        local cpu_gc_time = cpu_mem_stats.total_time / 1e9
        local cpu_alloc_count = Base.gc_alloc_count(cpu_mem_stats)
        local cpu_alloc_size = cpu_mem_stats.allocd

        Printf.@printf("%10.6f seconds", cpu_time)
        for (typ, gctime, libtime, bytes, allocs) in
            (("CPU", cpu_gc_time, 0, cpu_alloc_size, cpu_alloc_count),
             ("GPU", gpu_gc_time, gpu_lib_time, gpu_alloc_size, gpu_alloc_count))
            if bytes != 0 || allocs != 0
                # scale the allocation count to a unit from `Base._cnt_units`
                allocs, ma = Base.prettyprint_getunits(allocs, length(Base._cnt_units), Int64(1000))
                if ma == 1
                    Printf.@printf(" (%d%s %s allocation%s: ", allocs, Base._cnt_units[ma], typ, allocs==1 ? "" : "s")
                else
                    Printf.@printf(" (%.2f%s %s allocations: ", allocs, Base._cnt_units[ma], typ)
                end
                print(Base.format_bytes(bytes))
                if gctime > 0
                    Printf.@printf(", %.2f%% gc time", 100*gctime/cpu_time)
                    if libtime > 0
                        Printf.@printf(" of which %.2f%% spent allocating", 100*libtime/gctime)
                    end
                end
                print(")")
            elseif gctime > 0
                Printf.@printf(", %.2f%% %s gc time", 100*gctime/cpu_time, typ)
            end
        end
        println()

        val
    end
end