/*
*
* Copyright (c) 2021-2022 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights reserved.
*/
#include "parsec/parsec_config.h"
#include "parsec/mca/device/device.h"
#include "parsec/mca/device/device_gpu.h"
#include "parsec/utils/zone_malloc.h"
#include "parsec/constants.h"
#include "parsec/utils/debug.h"
#include "parsec/execution_stream.h"
#include "parsec/utils/argv.h"
#include "parsec/parsec_internal.h"
#include "parsec/scheduling.h"
#include <limits.h>
#define PARSEC_DEVICE_DATA_COPY_ATOMIC_SENTINEL 1024
#if defined(PARSEC_PROF_TRACE)
static int parsec_gpu_movein_key_start;
static int parsec_gpu_movein_key_end;
static int parsec_gpu_moveout_key_start;
static int parsec_gpu_moveout_key_end;
static int parsec_gpu_own_GPU_key_start;
static int parsec_gpu_own_GPU_key_end;
static int parsec_gpu_allocate_memory_key;
static int parsec_gpu_free_memory_key;
static int parsec_gpu_use_memory_key_start;
static int parsec_gpu_use_memory_key_end;
static int parsec_gpu_prefetch_key_start;
static int parsec_gpu_prefetch_key_end;
static int parsec_gpu_profiling_initiated = 0;
#endif /* defined(PARSEC_PROF_TRACE) */
int parsec_gpu_output_stream = -1;
int parsec_gpu_verbosity;
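/**
* Count how many of this task's flows will need a new data copy on the
* device: flows whose data has no copy on this GPU yet and which are
* accessed in read mode. This gives a cheap estimate of the memory pressure
* a pending task would put on the device.
*/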
static inline int
parsec_device_check_space_needed(parsec_device_gpu_module_t *gpu_device,
parsec_gpu_task_t *gpu_task)
{
int i;
int space_needed = 0;
parsec_task_t *this_task = gpu_task->ec;
parsec_data_t *original;
parsec_data_copy_t *data;
const parsec_flow_t *flow;
for( i = 0; i < this_task->task_class->nb_flows; i++ ) {
/* Make sure data_in is not NULL */
if( NULL == this_task->data[i].data_in ) continue;
flow = gpu_task->flow[i];
/* Skip pure control flows */
if(PARSEC_FLOW_ACCESS_NONE == (PARSEC_FLOW_ACCESS_MASK & flow->flow_flags)) continue;
data = this_task->data[i].data_in;
original = data->original;
if( NULL != PARSEC_DATA_GET_COPY(original, gpu_device->super.device_index) ) {
continue;
}
if(flow->flow_flags & PARSEC_FLOW_ACCESS_READ)
space_needed++;
}
return space_needed;
}
#if defined(PARSEC_PROF_TRACE)
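/**
* Register the GPU-related profiling keywords (GPU ownership, data movement,
* prefetch, memory allocation and memory use) with the PaRSEC profiling
* system. Only the first call has an effect.
*/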
void parsec_device_init_profiling(void)
{
if(parsec_gpu_profiling_initiated == 0) {
parsec_profiling_add_dictionary_keyword("gpu", "fill:#66ff66",
0, NULL,
&parsec_gpu_own_GPU_key_start, &parsec_gpu_own_GPU_key_end);
parsec_profiling_add_dictionary_keyword("movein", "fill:#33FF33",
sizeof(parsec_profile_data_collection_info_t),
PARSEC_PROFILE_DATA_COLLECTION_INFO_CONVERTOR,
&parsec_gpu_movein_key_start, &parsec_gpu_movein_key_end);
parsec_profiling_add_dictionary_keyword("moveout", "fill:#ffff66",
sizeof(parsec_profile_data_collection_info_t),
PARSEC_PROFILE_DATA_COLLECTION_INFO_CONVERTOR,
&parsec_gpu_moveout_key_start, &parsec_gpu_moveout_key_end);
parsec_profiling_add_dictionary_keyword("prefetch", "fill:#66ff66",
sizeof(parsec_profile_data_collection_info_t),
PARSEC_PROFILE_DATA_COLLECTION_INFO_CONVERTOR,
&parsec_gpu_prefetch_key_start, &parsec_gpu_prefetch_key_end);
parsec_profiling_add_dictionary_keyword("gpu_mem_alloc", "fill:#FF66FF",
#if (PARSEC_SIZEOF_SIZE_T == 4)
sizeof(uint32_t), "size{uint32_t}",
#elif (PARSEC_SIZEOF_SIZE_T == 8)
sizeof(uint64_t), "size{uint64_t}",
#else
#error "Unsupported case: sizeof(size_t) is neither 8 nor 4"
#endif // PARSEC_SIZEOF_SIZE_T
&parsec_gpu_allocate_memory_key, &parsec_gpu_free_memory_key);
parsec_profiling_add_dictionary_keyword("gpu_mem_use", "fill:#FF66FF",
sizeof(parsec_device_gpu_memory_prof_info_t),
PARSEC_DEVICE_GPU_MEMORY_PROF_INFO_CONVERTER,
&parsec_gpu_use_memory_key_start, &parsec_gpu_use_memory_key_end);
parsec_gpu_profiling_initiated = 1;
}
}
#endif
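/**
* Redirect GPU debugging output to its own stream, honoring the verbosity
* level requested by the user. The stream is only opened once.
*/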
void parsec_device_enable_debug(void)
{
if(parsec_gpu_output_stream == -1) {
parsec_gpu_output_stream = parsec_device_output;
if( parsec_gpu_verbosity >= 0 ) {
parsec_gpu_output_stream = parsec_output_open(NULL);
parsec_output_set_verbosity(parsec_gpu_output_stream, parsec_gpu_verbosity);
}
}
}
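/**
* Partially sort the pending task queue of a GPU device so that tasks
* requiring the least amount of new device memory come first. Only a window
* of NB_SORT elements, starting at the last remembered position, is ordered
* with a selection sort.
*/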
int parsec_device_sort_pending_list(parsec_device_module_t *device)
{
if( !PARSEC_DEV_IS_GPU(device->type) )
return 0;
parsec_device_gpu_module_t *gpu_device = (parsec_device_gpu_module_t *)device;
parsec_list_t *sort_list = gpu_device->exec_stream[0]->fifo_pending;
if( parsec_list_is_empty(sort_list) ) { /* nothing to sort */
return 0;
}
if (gpu_device->sort_starting_p == NULL || !parsec_list_nolock_contains(sort_list, gpu_device->sort_starting_p) ) {
gpu_device->sort_starting_p = (parsec_list_item_t*)sort_list->ghost_element.list_next;
}
/* p is head */
parsec_list_item_t *p = gpu_device->sort_starting_p;
int i, j, NB_SORT = 10, space_q, space_min;
parsec_list_item_t *q, *prev_p, *min_p;
for (i = 0; i < NB_SORT; i++) {
if ( p == &(sort_list->ghost_element) ) {
break;
}
min_p = p; /* assume the minimum one is the first one p */
q = (parsec_list_item_t*)min_p->list_next;
space_min = parsec_device_check_space_needed(gpu_device, (parsec_gpu_task_t*)min_p);
for (j = i+1; j < NB_SORT; j++) {
if ( q == &(sort_list->ghost_element) ) {
break;
}
space_q = parsec_device_check_space_needed(gpu_device, (parsec_gpu_task_t*)q);
if ( space_min > space_q ) {
min_p = q;
space_min = space_q;
}
q = (parsec_list_item_t*)q->list_next;
}
if (min_p != p) { /* minimum is not the first one, let's insert min_p before p */
/* take min_p out */
parsec_list_item_ring_chop(min_p);
PARSEC_LIST_ITEM_SINGLETON(min_p);
prev_p = (parsec_list_item_t*)p->list_prev;
/* insert min_p after prev_p */
parsec_list_add_after( sort_list, prev_p, min_p);
}
p = (parsec_list_item_t*)min_p->list_next;
}
return 0;
}
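/**
* Return a device workspace buffer of the requested size for this execution
* stream. The per-stream stack of PARSEC_GPU_MAX_WORKSPACE buffers is
* allocated lazily on first use. When PARSEC_GPU_ALLOC_PER_TILE is defined
* this is a no-op and NULL is returned.
*/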
void* parsec_device_pop_workspace(parsec_device_gpu_module_t* gpu_device,
parsec_gpu_exec_stream_t* gpu_stream, size_t size)
{
(void)gpu_device; (void)gpu_stream; (void)size;
void *work = NULL;
#if !defined(PARSEC_GPU_ALLOC_PER_TILE)
if (gpu_stream->workspace == NULL) {
gpu_stream->workspace = (parsec_gpu_workspace_t *)malloc(sizeof(parsec_gpu_workspace_t));
gpu_stream->workspace->total_workspace = PARSEC_GPU_MAX_WORKSPACE;
gpu_stream->workspace->stack_head = PARSEC_GPU_MAX_WORKSPACE - 1;
for( int i = 0; i < PARSEC_GPU_MAX_WORKSPACE; i++ ) {
gpu_stream->workspace->workspace[i] = zone_malloc( gpu_device->memory, size);
PARSEC_DEBUG_VERBOSE(2, parsec_gpu_output_stream,
"GPU[%s] Succeeded Allocating workspace %d (device_ptr %p)",
gpu_device->super.name,
i, gpu_stream->workspace->workspace[i]);
#if defined(PARSEC_PROF_TRACE)
if((gpu_device->trackable_events & PARSEC_PROFILE_GPU_TRACK_MEM_USE) &&
(gpu_device->exec_stream[0]->prof_event_track_enable ||
gpu_device->exec_stream[1]->prof_event_track_enable)) {
parsec_profiling_trace_flags(gpu_stream->profiling,
parsec_gpu_allocate_memory_key, (int64_t)
gpu_stream->workspace->workspace[i], gpu_device->super.device_index,
&size, PARSEC_PROFILING_EVENT_COUNTER|PARSEC_PROFILING_EVENT_HAS_INFO);
}
#endif
}
}
if (gpu_stream->workspace->stack_head < 0) {
parsec_fatal("parsec_device_pop_workspace: user requested more than %d GPU workspaces which is the current hard-coded limit per GPU stream\n", PARSEC_GPU_MAX_WORKSPACE);
return NULL;
}
work = gpu_stream->workspace->workspace[gpu_stream->workspace->stack_head];
gpu_stream->workspace->stack_head --;
#endif /* !defined(PARSEC_GPU_ALLOC_PER_TILE) */
return work;
}
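/**
* Give back a workspace buffer previously obtained from
* parsec_device_pop_workspace to the stream's workspace stack.
*/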
int parsec_device_push_workspace(parsec_device_gpu_module_t* gpu_device, parsec_gpu_exec_stream_t* gpu_stream)
{
(void)gpu_device; (void)gpu_stream;
#if !defined(PARSEC_GPU_ALLOC_PER_TILE)
gpu_stream->workspace->stack_head ++;
assert (gpu_stream->workspace->stack_head < PARSEC_GPU_MAX_WORKSPACE);
#endif /* !defined(PARSEC_GPU_ALLOC_PER_TILE) */
return 0;
}
int parsec_device_free_workspace(parsec_device_gpu_module_t * gpu_device)
{
(void)gpu_device;
#if !defined(PARSEC_GPU_ALLOC_PER_TILE)
int i, j;
for( i = 0; i < gpu_device->num_exec_streams; i++ ) {
parsec_gpu_exec_stream_t *gpu_stream = gpu_device->exec_stream[i];
if (gpu_stream->workspace != NULL) {
for (j = 0; j < gpu_stream->workspace->total_workspace; j++) {
#if defined(PARSEC_PROF_TRACE)
if((gpu_device->trackable_events & PARSEC_PROFILE_GPU_TRACK_MEM_USE) &&
(gpu_device->exec_stream[0]->prof_event_track_enable ||
gpu_device->exec_stream[1]->prof_event_track_enable)) {
parsec_profiling_trace_flags(gpu_stream->profiling,
parsec_gpu_free_memory_key, (int64_t)
gpu_stream->workspace->workspace[j], gpu_device->super.device_index,
NULL, PARSEC_PROFILING_EVENT_COUNTER);
}
#endif
PARSEC_DEBUG_VERBOSE(2, parsec_gpu_output_stream,
"GPU[%s] Release workspace %d (device_ptr %p)",
gpu_device->super.name,
j, gpu_stream->workspace->workspace[j]);
zone_free( gpu_device->memory, gpu_stream->workspace->workspace[j] );
}
free(gpu_stream->workspace);
gpu_stream->workspace = NULL;
}
}
#endif /* !defined(PARSEC_GPU_ALLOC_PER_TILE) */
return 0;
}
#if defined(PARSEC_DEBUG_NOISIER)
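/**
* Build a human-readable description of a GPU task (kernel, prefetch,
* warmup, D2H transfer, or D2D completion) into the provided buffer, for
* debugging purposes.
*/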
char *parsec_device_describe_gpu_task( char *tmp, size_t len, parsec_gpu_task_t *gpu_task )
{
char buffer[64];
parsec_data_t *data;
switch( gpu_task->task_type ) {
case PARSEC_GPU_TASK_TYPE_KERNEL:
return parsec_task_snprintf(tmp, len, gpu_task->ec);
case PARSEC_GPU_TASK_TYPE_PREFETCH:
assert(NULL != gpu_task->ec);
assert(NULL != gpu_task->ec->data[0].data_in );
data = gpu_task->ec->data[0].data_in->original;
if( NULL == data || NULL == data->dc )
snprintf(tmp, len, "PREFETCH for unbound data %p", data);
else {
data->dc->key_to_string(data->dc, data->key, buffer, 64);
snprintf(tmp, len, "PREFETCH for %s (data %p)", buffer, data);
}
return tmp;
case PARSEC_GPU_TASK_TYPE_WARMUP:
assert(NULL != gpu_task->copy->original && NULL != gpu_task->copy->original->dc);
gpu_task->copy->original->dc->key_to_string(gpu_task->copy->original->dc, gpu_task->copy->original->key, buffer, 64);
snprintf(tmp, len, "WARMUP %s on device %d",
buffer, gpu_task->copy->device_index);
return tmp;
case PARSEC_GPU_TASK_TYPE_D2HTRANSFER:
snprintf(tmp, len, "Device to Host Transfer");
return tmp;
case PARSEC_GPU_TASK_TYPE_D2D_COMPLETE:
snprintf(tmp, len, "D2D Transfer Complete for data copy %p [ref_count %d]",
gpu_task->ec->data[0].data_out, gpu_task->ec->data[0].data_out->super.super.obj_reference_count);
return tmp;
default:
snprintf(tmp, len, "*** Internal Error: unknown gpu task type %d ***", gpu_task->task_type);
return tmp;
}
}
#endif
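/**
* Dump the state of an execution stream: its event window and the tasks
* currently registered on it. The fifo_pending queue is not dumped.
*/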
void parsec_device_dump_exec_stream(parsec_gpu_exec_stream_t* exec_stream)
{
char task_str[128];
int i;
parsec_debug_verbose(0, parsec_gpu_output_stream,
"Dev: GPU stream %s{%p} [events = %d, start = %d, end = %d, executed = %d]",
exec_stream->name, exec_stream, exec_stream->max_events, exec_stream->start, exec_stream->end,
exec_stream->executed);
for( i = 0; i < exec_stream->max_events; i++ ) {
if( NULL == exec_stream->tasks[i] ) continue;
parsec_debug_verbose(0, parsec_gpu_output_stream,
" %d: %s", i, parsec_task_snprintf(task_str, 128, exec_stream->tasks[i]->ec));
}
/* Don't yet dump the fifo_pending queue */
}
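/**
* Dump the complete state of a GPU device: transfer statistics, all
* execution streams, and the content of both LRU lists.
*/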
void parsec_device_dump_gpu_state(parsec_device_gpu_module_t* gpu_device)
{
int i;
uint64_t data_in_host, data_in_dev = 0;
data_in_host = gpu_device->super.data_in_from_device[0];
for(i = 1; i < gpu_device->super.data_in_array_size; i++) {
data_in_dev += gpu_device->super.data_in_from_device[i];
}
parsec_output(parsec_gpu_output_stream, "\n\n");
parsec_output(parsec_gpu_output_stream, "Device %d:%d (%p) epoch %llu\n", gpu_device->super.device_index,
gpu_device->super.device_index, gpu_device, (unsigned long long)gpu_device->data_avail_epoch);
parsec_output(parsec_gpu_output_stream, "\tpeer mask %x executed tasks %llu streams %d\n",
gpu_device->peer_access_mask, (unsigned long long)gpu_device->super.executed_tasks, gpu_device->num_exec_streams);
parsec_output(parsec_gpu_output_stream, "\tstats transferred [in: %llu from host, %llu from other devices; out: %llu to host] required [in: %llu out: %llu]\n",
(unsigned long long)data_in_host, (unsigned long long)data_in_dev,
(unsigned long long)gpu_device->super.data_out_to_host,
(unsigned long long)gpu_device->super.required_data_in, (unsigned long long)gpu_device->super.required_data_out);
for( i = 0; i < gpu_device->num_exec_streams; i++ ) {
parsec_device_dump_exec_stream(gpu_device->exec_stream[i]);
}
if( !parsec_list_is_empty(&gpu_device->gpu_mem_lru) ) {
parsec_output(parsec_gpu_output_stream, "#\n# LRU list\n#\n");
i = 0;
PARSEC_LIST_ITERATOR(&gpu_device->gpu_mem_lru, item,
{
parsec_gpu_data_copy_t* gpu_copy = (parsec_gpu_data_copy_t*)item;
parsec_output(parsec_gpu_output_stream, " %d. elem %p flags 0x%x GPU mem %p\n",
i, gpu_copy, gpu_copy->flags, gpu_copy->device_private);
parsec_dump_data_copy(gpu_copy);
i++;
});
}
if( !parsec_list_is_empty(&gpu_device->gpu_mem_owned_lru) ) {
parsec_output(parsec_gpu_output_stream, "#\n# Owned LRU list\n#\n");
i = 0;
PARSEC_LIST_ITERATOR(&gpu_device->gpu_mem_owned_lru, item,
{
parsec_gpu_data_copy_t* gpu_copy = (parsec_gpu_data_copy_t*)item;
parsec_output(parsec_gpu_output_stream, " %d. elem %p flags 0x%x GPU mem %p\n",
i, gpu_copy, gpu_copy->flags, gpu_copy->device_private);
parsec_dump_data_copy(gpu_copy);
i++;
});
}
parsec_output(parsec_gpu_output_stream, "\n\n");
}
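/*
* Synthetic flow and task class used to wrap user-requested prefetches into
* regular GPU tasks, so they can be queued and processed by the device
* manager like any other data movement.
*/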
static parsec_flow_t parsec_device_data_prefetch_flow = {
.name = "PREFETCH FLOW",
.flow_flags = PARSEC_FLOW_ACCESS_READ,
.flow_index = 0,
};
static parsec_task_class_t parsec_device_data_prefetch_tc = {
.name = "DEVICE PREFETCH",
.flags = 0,
.task_class_id = 0,
.nb_flows = 1,
.nb_parameters = 0,
.nb_locals = 0,
.dependencies_goal = 0,
.params = { NULL, },
.in = { &parsec_device_data_prefetch_flow, NULL },
.out = { NULL, },
.priority = NULL,
.properties = NULL,
.initial_data = NULL,
.final_data = NULL,
.data_affinity = NULL,
.key_functions = NULL,
.make_key = NULL,
.get_datatype = NULL,
.prepare_input = NULL,
.incarnations = NULL,
.prepare_output = NULL,
.find_deps = NULL,
.iterate_successors = NULL,
.iterate_predecessors = NULL,
.release_deps = NULL,
.complete_execution = NULL,
.new_task = NULL,
.release_task = NULL,
.fini = NULL
};
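/**
* Release the resources attached to a completed prefetch task: drop the
* reference on the prefetched data copy and free the synthetic task.
*/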
static int
parsec_device_release_resources_prefetch_task(parsec_device_gpu_module_t* gpu_device,
parsec_gpu_task_t** out_task)
{
#if defined(PARSEC_DEBUG_NOISIER)
char tmp[MAX_TASK_STRLEN];
#endif
parsec_gpu_task_t *gpu_task = *out_task;
(void)gpu_device;
#if defined(PARSEC_DEBUG_NOISIER)
PARSEC_DEBUG_VERBOSE(10, parsec_gpu_output_stream, "GPU[%s]: Releasing resources for task %s (%p with ec %p)",
gpu_device->super.name, parsec_device_describe_gpu_task(tmp, MAX_TASK_STRLEN, gpu_task),
gpu_task, gpu_task->ec);
#endif /* defined(PARSEC_DEBUG_NOISIER) */
assert( PARSEC_GPU_TASK_TYPE_PREFETCH == gpu_task->task_type );
PARSEC_DATA_COPY_RELEASE( gpu_task->ec->data[0].data_in);
free( gpu_task->ec );
gpu_task->ec = NULL;
return 0;
}
#if defined(PARSEC_DEBUG_NOISIER)
static char *parsec_device_debug_advice_to_string(int advice)
{
switch(advice) {
case PARSEC_DEV_DATA_ADVICE_PREFETCH:
return "Prefetch";
case PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE:
return "Set Preferred Device";
case PARSEC_DEV_DATA_ADVICE_WARMUP:
return "Mark data as recently used";
default:
assert(0);
return "Undefined advice";
}
}
#endif
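/**
* Apply a user-provided advice to a data on this device: set the preferred
* device, or schedule an asynchronous prefetch of the owner copy onto this
* device (contiguous datatypes only). A typical call looks like
*   parsec_device_data_advise(dev, data, PARSEC_DEV_DATA_ADVICE_PREFETCH);
*/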
int
parsec_device_data_advise(parsec_device_module_t *dev, parsec_data_t *data, int advice)
{
parsec_device_gpu_module_t* gpu_device = (parsec_device_gpu_module_t*)dev;
#if defined(PARSEC_DEBUG_NOISIER)
char buffer[64];
if(NULL != data->dc) {
data->dc->key_to_string(data->dc, data->key, buffer, 64);
} else {
snprintf(buffer, 64, "unbound data");
}
#endif
#if defined(PARSEC_DEBUG_NOISIER)
PARSEC_DEBUG_VERBOSE(10, parsec_gpu_output_stream, "GPU[%s]: User provides advice %s on %s (%p)",
gpu_device->super.name,
parsec_device_debug_advice_to_string(advice),
buffer,
data);
#endif /* defined(PARSEC_DEBUG_NOISIER) */
switch(advice) {
case PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE:
data->preferred_device = dev->device_index;
break;
case PARSEC_DEV_DATA_ADVICE_PREFETCH:
{
if( parsec_type_contiguous(data->device_copies[ data->owner_device ]->dtt) != PARSEC_SUCCESS ) {
parsec_warning( "%s:%d %s", __FILE__, __LINE__,
"PARSEC_DEV_DATA_ADVICE_PREFETCH cannot be applied to non-contiguous types");
return PARSEC_ERROR;
}
parsec_gpu_task_t* gpu_task = NULL;
gpu_task = (parsec_gpu_task_t*)calloc(1, sizeof(parsec_gpu_task_t));
gpu_task->task_type = PARSEC_GPU_TASK_TYPE_PREFETCH;
gpu_task->ec = calloc(1, sizeof(parsec_task_t));
PARSEC_OBJ_CONSTRUCT(gpu_task->ec, parsec_task_t);
gpu_task->ec->task_class = &parsec_device_data_prefetch_tc;
gpu_task->flow[0] = &parsec_device_data_prefetch_flow;
gpu_task->flow_nb_elts[0] = data->device_copies[ data->owner_device ]->original->nb_elts;
gpu_task->stage_in = parsec_default_gpu_stage_in;
gpu_task->stage_out = parsec_default_gpu_stage_out;
PARSEC_DEBUG_VERBOSE(20, parsec_debug_output, "Retain data copy %p [ref_count %d] at %s:%d",
data->device_copies[ data->owner_device ],
data->device_copies[ data->owner_device ]->super.super.obj_reference_count,
__FILE__, __LINE__);
PARSEC_OBJ_RETAIN(data->device_copies[ data->owner_device ]);
gpu_task->ec->data[0].data_in = data->device_copies[ data->owner_device ];
gpu_task->ec->data[0].data_out = NULL;
gpu_task->ec->data[0].source_repo_entry = NULL;
gpu_task->ec->data[0].source_repo = NULL;
PARSEC_DEBUG_VERBOSE(10, parsec_gpu_output_stream,
"GPU[%s]: data copy %p [ref_count %d] linked to prefetch gpu task %p",
gpu_device->super.name, gpu_task->ec->data[0].data_in, gpu_task->ec->data[0].data_in->super.super.obj_reference_count,
gpu_task);
parsec_fifo_push( &(gpu_device->pending), (parsec_list_item_t*)gpu_task );
return PARSEC_SUCCESS;
}
break;
case PARSEC_DEV_DATA_ADVICE_WARMUP:
return PARSEC_ERR_NOT_IMPLEMENTED;
break;
default:
assert(0);
return PARSEC_ERR_NOT_FOUND;
}
return PARSEC_SUCCESS;
}
/**
* Register a taskpool with a device by checking that the device
* supports the dynamic function required by the different incarnations.
* If multiple devices of the same type exist, we assume that all have
* the same capabilities.
*/
int
parsec_device_taskpool_register(parsec_device_module_t* device,
parsec_taskpool_t* tp)
{
parsec_device_gpu_module_t* gpu_device = (parsec_device_gpu_module_t*)device;
int32_t rc = PARSEC_ERR_NOT_FOUND;
uint32_t i, j;
/**
* Detect if a particular chore has a dynamic load dependency and if yes
* load the corresponding module and find the function.
*/
assert(PARSEC_DEV_IS_GPU(device->type));
assert(tp->devices_index_mask & (1 << device->device_index));
for( i = 0; i < tp->nb_task_classes; i++ ) {
const parsec_task_class_t* tc = tp->task_classes_array[i];
__parsec_chore_t* chores = (__parsec_chore_t*)tc->incarnations;
for( j = 0; NULL != chores[j].hook; j++ ) {
if( chores[j].type != device->type )
continue;
if( NULL != chores[j].dyld_fn ) {
/* the function has been set for another device of the same type */
return PARSEC_SUCCESS;
}
if ( NULL == chores[j].dyld ) {
chores[j].dyld_fn = NULL; /* No dynamic support required for this kernel */
rc = PARSEC_SUCCESS;
} else {
void* devf = gpu_device->find_incarnation(gpu_device, chores[j].dyld);
if( NULL != devf ) {
chores[j].dyld_fn = devf;
rc = PARSEC_SUCCESS;
}
}
}
}
if( PARSEC_SUCCESS != rc ) {
tp->devices_index_mask &= ~(1 << device->device_index); /* drop support for this device */
parsec_debug_verbose(10, parsec_gpu_output_stream,
"Device %d (%s) disabled for taskpool %p", device->device_index, device->name, tp);
}
return rc;
}
int
parsec_device_taskpool_unregister(parsec_device_module_t* device, parsec_taskpool_t* tp)
{
(void)device; (void)tp;
return PARSEC_SUCCESS;
}
/**
* Attach a device to a PaRSEC context. A device can only be attached to
* a single context at a time.
*/
int
parsec_device_attach( parsec_device_module_t* device, parsec_context_t* context )
{
return parsec_mca_device_add(context, device);
}
/**
* Detach a device from a context. Both the context and the device remain
* valid, they are simply disconnected.
* This function should only be called once all tasks and all data related to the
* context have been removed from the device.
*/
int
parsec_device_detach( parsec_device_module_t* device, parsec_context_t* context )
{
(void)context;
return parsec_mca_device_remove(device);
}
/**
* This function reserves memory_percentage percent of the total device memory for PaRSEC.
* This memory will be managed in chunks of size eltsize. However, multiple chunks
* can be reserved in a single allocation.
*/
int
parsec_device_memory_reserve( parsec_device_gpu_module_t* gpu_device,
int memory_percentage,
int number_blocks,
size_t eltsize )
{
int rc;
(void)eltsize;
size_t how_much_we_allocate;
size_t total_mem, initial_free_mem;
uint32_t mem_elem_per_gpu = 0;
rc = gpu_device->set_device(gpu_device);
if(PARSEC_SUCCESS != rc)
return rc;
/* Determine how much memory we can allocate */
rc = gpu_device->memory_info( gpu_device, &initial_free_mem, &total_mem );
if(PARSEC_SUCCESS != rc)
return rc;
if( number_blocks != -1 ) {
if( number_blocks == 0 ) {
parsec_warning("GPU[%s] Invalid argument: requesting 0 bytes of memory",
gpu_device->super.name);
return PARSEC_ERROR;
} else {
how_much_we_allocate = number_blocks * eltsize;
}
} else {
/** number_blocks == -1 means memory_percentage is used */
how_much_we_allocate = (memory_percentage * initial_free_mem) / 100;
}
if( how_much_we_allocate > initial_free_mem ) {
/** Handle the case of jokers who require more than 100% of memory,
* and the case of computer scientists who don't know how
* to divide a number by another
*/
parsec_warning("GPU[%s] Requested %zd bytes on GPU device, but only %zd bytes are available -- reducing allocation to max available",
gpu_device->super.name, how_much_we_allocate, initial_free_mem);
how_much_we_allocate = initial_free_mem;
}
if( how_much_we_allocate < eltsize ) {
/** Handle another kind of joker entirely, as well as the case of
* insufficient memory on the device
*/
parsec_warning("GPU[%s] Cannot allocate at least one element",
gpu_device->super.name);
return PARSEC_ERROR;
}
#if defined(PARSEC_GPU_ALLOC_PER_TILE)
size_t free_mem = initial_free_mem;
/*
* We allocate a bunch of tiles that will be used
* during the computations
*/
while( (free_mem > eltsize )
&& ((total_mem - free_mem) < how_much_we_allocate) ) {
parsec_gpu_data_copy_t* gpu_elem;
void *device_ptr;
rc = gpu_device->memory_allocate(gpu_device, eltsize, &device_ptr);
if(PARSEC_SUCCESS != rc) {
size_t _free_mem, _total_mem;
gpu_device->memory_info(gpu_device, &_free_mem, &_total_mem );
parsec_inform("GPU[%s] Per context: free mem %zu total mem %zu (allocated tiles %u)",
gpu_device->super.name,_free_mem, _total_mem, mem_elem_per_gpu);
break;
}
gpu_elem = PARSEC_OBJ_NEW(parsec_data_copy_t);
PARSEC_DEBUG_VERBOSE(20, parsec_gpu_output_stream,
"GPU[%s] Allocate GPU copy %p [ref_count %d] for data [%p]",
gpu_device->super.name,gpu_elem, gpu_elem->super.obj_reference_count, NULL);
gpu_elem->device_private = device_ptr;
gpu_elem->flags |= PARSEC_DATA_FLAG_PARSEC_OWNED;
gpu_elem->device_index = gpu_device->super.device_index;
mem_elem_per_gpu++;
PARSEC_OBJ_RETAIN(gpu_elem);
PARSEC_DEBUG_VERBOSE(20, parsec_gpu_output_stream,
"GPU[%s] Retain and insert GPU copy %p [ref_count %d] in LRU",
gpu_device->super.name, gpu_elem, gpu_elem->super.obj_reference_count);
parsec_list_push_back( &gpu_device->gpu_mem_lru, (parsec_list_item_t*)gpu_elem );
gpu_device->memory_info( gpu_device, &free_mem, &total_mem );
}
if( 0 == mem_elem_per_gpu && parsec_list_is_empty( &gpu_device->gpu_mem_lru ) ) {
parsec_warning("GPU[%s] Cannot allocate memory on the GPU device. Skip it!", gpu_device->super.name);
}
else {
PARSEC_DEBUG_VERBOSE(20, parsec_gpu_output_stream,
"GPU[%s] Allocate %u tiles on the GPU memory",
gpu_device->super.name, mem_elem_per_gpu );
}
#else
if( NULL == gpu_device->memory ) {
void* base_ptr;
/* We allocate all the memory on the GPU and we use our memory management. */
/* This computation leads to allocating more than available if we asked for more than GPU memory */
mem_elem_per_gpu = (how_much_we_allocate + eltsize - 1 ) / eltsize;
size_t total_size = (size_t)mem_elem_per_gpu * eltsize;
if (total_size > initial_free_mem) {
/* Mapping more than 100% of GPU memory is obviously wrong.
* Mapping exactly 100% of the GPU memory ends up producing errors such as
* "__global__ function call is not configured".
* We leave a margin for the GPU runtime and cap the allocation at 90% of
* the free memory; how much the runtime actually needs is hard to compute. */
total_size = (size_t)((size_t)(.9 * initial_free_mem / eltsize)) * eltsize;
mem_elem_per_gpu = total_size / eltsize;
}
rc = gpu_device->memory_allocate(gpu_device, total_size, &base_ptr);
if(PARSEC_SUCCESS != rc) {
parsec_warning("GPU[%s] Allocating %zu bytes of memory on the GPU device failed",
gpu_device->super.name, total_size);
gpu_device->memory = NULL;
return PARSEC_ERROR;
}
gpu_device->memory = zone_malloc_init( base_ptr, mem_elem_per_gpu, eltsize );
if( gpu_device->memory == NULL ) {
parsec_warning("GPU[%s] Cannot allocate memory on GPU %s. Skip it!",
gpu_device->super.name, gpu_device->super.name);
return PARSEC_ERROR;
}
PARSEC_DEBUG_VERBOSE(20, parsec_gpu_output_stream,
"GPU[%s] Allocate %u segments of size %zu on the GPU memory",
gpu_device->super.name, mem_elem_per_gpu, eltsize );
}
#endif
gpu_device->mem_block_size = eltsize;
gpu_device->mem_nb_blocks = mem_elem_per_gpu;
return PARSEC_SUCCESS;
}
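/**
* Pop every data copy from the given list and release it, returning its
* device memory to the allocator. A warning is issued if a copy still owns
* the master version of its data.
*/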
static void parsec_device_memory_release_list(parsec_device_gpu_module_t* gpu_device,
parsec_list_t* list)
{
parsec_list_item_t* item;
while(NULL != (item = parsec_list_pop_front(list)) ) {
parsec_gpu_data_copy_t* gpu_copy = (parsec_gpu_data_copy_t*)item;
parsec_data_t* original = gpu_copy->original;
PARSEC_DEBUG_VERBOSE(2, parsec_gpu_output_stream,
"GPU[%s] Release GPU copy %p (device_ptr %p) [ref_count %d: must be 1], attached to %p, in map %p",
gpu_device->super.name, gpu_copy, gpu_copy->device_private, gpu_copy->super.super.obj_reference_count,
original, (NULL != original ? original->dc : NULL));
assert( gpu_copy->device_index == gpu_device->super.device_index );
if( PARSEC_DATA_COHERENCY_OWNED == gpu_copy->coherency_state ) {
parsec_warning("GPU[%s] still OWNS the master memory copy for data %d and it is discarding it!",
gpu_device->super.name, original->key);
}
assert(0 != (gpu_copy->flags & PARSEC_DATA_FLAG_PARSEC_OWNED) );
#if defined(PARSEC_GPU_ALLOC_PER_TILE)
gpu_device->memory_free( gpu_copy->device_private );
#else
#if defined(PARSEC_PROF_TRACE)
if((gpu_device->trackable_events & PARSEC_PROFILE_GPU_TRACK_MEM_USE) &&
(gpu_device->exec_stream[0]->prof_event_track_enable ||
gpu_device->exec_stream[1]->prof_event_track_enable)) {
parsec_profiling_trace_flags(gpu_device->exec_stream[0]->profiling,
parsec_gpu_free_memory_key, (int64_t)gpu_copy->device_private,
gpu_device->super.device_index,
NULL, PARSEC_PROFILING_EVENT_COUNTER);
parsec_profiling_trace_flags(gpu_device->exec_stream[0]->profiling,
parsec_gpu_use_memory_key_end,
(uint64_t)gpu_copy->device_private,
gpu_device->super.device_index, NULL, 0);
}
#endif
zone_free( gpu_device->memory, (void*)gpu_copy->device_private );
#endif
gpu_copy->device_private = NULL;
/* At this point the data copies should have no attachment to a data_t. Thus,
* before we get here (i.e., during parsec_fini), the destructor of the data
* collection must have been called, releasing all the copies.
*/
PARSEC_OBJ_RELEASE(gpu_copy); assert(NULL == gpu_copy);
}
}
/**
* This function only flushes the data copies pending in LRU, and checks
* (in debug mode) that the entire allocated memory is free to use */
int
parsec_device_flush_lru( parsec_device_module_t *device )
{
size_t in_use;
parsec_device_gpu_module_t *gpu_device = (parsec_device_gpu_module_t*)device;
/* Free all memory on GPU */
parsec_device_memory_release_list(gpu_device, &gpu_device->gpu_mem_lru);
parsec_device_memory_release_list(gpu_device, &gpu_device->gpu_mem_owned_lru);
parsec_device_free_workspace(gpu_device);
#if !defined(PARSEC_GPU_ALLOC_PER_TILE) && !defined(NDEBUG)
if( (in_use = zone_in_use(gpu_device->memory)) != 0 ) {
parsec_warning("GPU[%s] memory leak detected: %zu bytes still allocated on GPU",
device->name, in_use);
assert(0);
}
#endif
return PARSEC_SUCCESS;
}
/**
* This function releases the GPU memory reserved for this device.
*
* Note that all the data available on the GPU is stored in one of the two
* lists used to keep track of the allocated data, either the gpu_mem_lru or
* the gpu_mem_owned_lru. Thus, going over all the elements in these two lists
* should be enough to enforce a clean release.
*/
int
parsec_device_memory_release( parsec_device_gpu_module_t* gpu_device )
{
int rc;
rc = gpu_device->set_device(gpu_device);
if(PARSEC_SUCCESS != rc)
return rc;
parsec_device_flush_lru(&gpu_device->super);
#if !defined(PARSEC_GPU_ALLOC_PER_TILE)
assert( NULL != gpu_device->memory );
void* ptr = zone_malloc_fini(&gpu_device->memory);
rc = gpu_device->memory_free(gpu_device, ptr);
if(PARSEC_SUCCESS != rc) {
parsec_warning("Failed to free the GPU backend memory.");
return rc;
}
#endif
return PARSEC_SUCCESS;
}
/**
* Try to find memory space to move all data on the GPU. We attach a device_elem to
* a memory_elem as soon as a device_elem is available. If we fail to find enough
* available elements, we push all the elements handled during this allocation
* back into the pool of available device_elem, to be picked up by another call
* (this call will remove them from the current task).
* Returns:
* PARSEC_HOOK_RETURN_DONE: All gpu_mem/mem_elem have been initialized
* PARSEC_HOOK_RETURN_AGAIN: At least one flow is marked under transfer, task cannot be scheduled yet
* PARSEC_HOOK_RETURN_NEXT: The task needs to be rescheduled
*/
static inline int
parsec_device_data_reserve_space( parsec_device_gpu_module_t* gpu_device,
parsec_gpu_task_t *gpu_task )
{
parsec_task_t *this_task = gpu_task->ec;
parsec_gpu_data_copy_t* temp_loc[MAX_PARAM_COUNT], *gpu_elem, *lru_gpu_elem;
parsec_data_t* master, *oldmaster;
const parsec_flow_t *flow;
int i, j, data_avail_epoch = 0, copy_readers_update = 0;
parsec_gpu_data_copy_t *gpu_mem_lru_cycling = NULL;
#if defined(PARSEC_DEBUG_NOISIER)
char task_name[MAX_TASK_STRLEN];
parsec_task_snprintf(task_name, MAX_TASK_STRLEN, this_task);
#endif /* defined(PARSEC_DEBUG_NOISIER) */
(void)copy_readers_update; // potentially unused
/**
* Parse all the input and output flows of data and ensure all have
* corresponding data on the GPU available.
*/
for( i = 0; i < this_task->task_class->nb_flows; i++ ) {
flow = gpu_task->flow[i];
assert( flow && (flow->flow_index == i) );
/* Skip CTL flows only */
if(PARSEC_FLOW_ACCESS_NONE == (PARSEC_FLOW_ACCESS_MASK & flow->flow_flags)) continue;
PARSEC_DEBUG_VERBOSE(20, parsec_gpu_output_stream,
"GPU[%s]:%s: Investigating flow %s:%d",
gpu_device->super.name, task_name, flow->name, i);
temp_loc[i] = NULL;
if (this_task->data[i].data_in == NULL)
continue;
master = this_task->data[i].data_in->original;
parsec_atomic_lock(&master->lock);
gpu_elem = PARSEC_DATA_GET_COPY(master, gpu_device->super.device_index);
this_task->data[i].data_out = gpu_elem;
/* There is already a copy on the device */
if( NULL != gpu_elem ) {
PARSEC_DEBUG_VERBOSE(20, parsec_gpu_output_stream,
"GPU[%s]:%s: Flow %s:%i has a copy on the device %p%s",
gpu_device->super.name, task_name,
flow->name, i, gpu_elem,
gpu_elem->data_transfer_status == PARSEC_DATA_STATUS_UNDER_TRANSFER ? " [in transfer]" : "");
if ( gpu_elem->data_transfer_status == PARSEC_DATA_STATUS_UNDER_TRANSFER ) {
/* The data is indeed under transfer, but as we always force an event at the end of this
* step, we do not need to have a special case for this, because the forced event will
* ensure the data will be available on the GPU by the time this task will move to the
* next step.
*/
}
parsec_atomic_unlock(&master->lock);
continue;
}
#if !defined(PARSEC_GPU_ALLOC_PER_TILE)
gpu_elem = PARSEC_OBJ_NEW(parsec_data_copy_t);
PARSEC_DEBUG_VERBOSE(20, parsec_gpu_output_stream,
"GPU[%s]:%s: Allocate GPU copy %p sz %zu [ref_count %d] for data %p",
gpu_device->super.name, task_name,
gpu_elem, gpu_task->flow_nb_elts[i], gpu_elem->super.super.obj_reference_count, master);
gpu_elem->flags = PARSEC_DATA_FLAG_PARSEC_OWNED | PARSEC_DATA_FLAG_PARSEC_MANAGED;
malloc_data:
copy_readers_update = 0;
assert(0 != (gpu_elem->flags & PARSEC_DATA_FLAG_PARSEC_OWNED) );
gpu_elem->device_private = zone_malloc(gpu_device->memory, gpu_task->flow_nb_elts[i]);
if( NULL == gpu_elem->device_private ) {
#endif
find_another_data:
temp_loc[i] = NULL;
/* Look for a data_copy to free */
lru_gpu_elem = (parsec_gpu_data_copy_t*)parsec_list_pop_front(&gpu_device->gpu_mem_lru);
if( NULL == lru_gpu_elem ) {
/* We can't find enough room on the GPU. Insert the tiles at the beginning of
* the LRU (in order to be reused asap) and return with an error.
*/
release_temp_and_return:
#if defined(PARSEC_DEBUG_NOISIER)
PARSEC_DEBUG_VERBOSE(2, parsec_gpu_output_stream,
"GPU[%s]:%s:\tRequest space on GPU failed for flow %s index %d/%d for task %s",
gpu_device->super.name, task_name,
flow->name, i, this_task->task_class->nb_flows, task_name );
#endif /* defined(PARSEC_DEBUG_NOISIER) */
for( j = 0; j <= i; j++ ) {
/* This flow could be a control flow */
if( NULL == temp_loc[j] ) continue;
/* This flow could be non-parsec-owned, in which case we can't reclaim it */
if( 0 == (temp_loc[j]->flags & PARSEC_DATA_FLAG_PARSEC_OWNED) ) continue;
PARSEC_DEBUG_VERBOSE(20, parsec_gpu_output_stream,
"GPU[%s]:%s:\tAdd copy %p [ref_count %d] back to the LRU list",
gpu_device->super.name, task_name,
temp_loc[j], temp_loc[j]->super.super.obj_reference_count);
/* push them at the head to reach them again at the next iteration */
parsec_list_push_front(&gpu_device->gpu_mem_lru, (parsec_list_item_t*)temp_loc[j]);
}
#if !defined(PARSEC_GPU_ALLOC_PER_TILE)
PARSEC_OBJ_RELEASE(gpu_elem);
#endif
parsec_atomic_unlock(&master->lock);
return PARSEC_HOOK_RETURN_AGAIN;
}
PARSEC_LIST_ITEM_SINGLETON(lru_gpu_elem);
PARSEC_DEBUG_VERBOSE(20, parsec_gpu_output_stream,
"GPU[%s]:%s: Evaluate LRU-retrieved GPU copy %p [ref_count %d] original %p",
gpu_device->super.name, task_name,
lru_gpu_elem, lru_gpu_elem->super.super.obj_reference_count,
lru_gpu_elem->original);
if( gpu_mem_lru_cycling == lru_gpu_elem ) {
PARSEC_DEBUG_VERBOSE(2, parsec_gpu_output_stream,
"GPU[%s]: Cycle detected on allocating memory for %s",
gpu_device->super.name, task_name);
temp_loc[i] = lru_gpu_elem; /* save it such that it gets pushed back into the LRU */
goto release_temp_and_return;
}
/* If there are pending readers, let the gpu_elem loose. This is a weak coordination
* protocol between here and the parsec_device_data_stage_in, where the readers don't necessarily
* always remove the data from the LRU.
*/
if( 0 != lru_gpu_elem->readers ) {
PARSEC_DEBUG_VERBOSE(20, parsec_gpu_output_stream,
"GPU[%s]:%s: Drop LRU-retrieved GPU copy %p [readers %d, ref_count %d] original %p",
gpu_device->super.name, task_name,
lru_gpu_elem, lru_gpu_elem->readers, lru_gpu_elem->super.super.obj_reference_count, lru_gpu_elem->original);
/* We do not add the copy back into the LRU. This means that for now this copy is not
* tracked via the LRU (despite being only used in read mode) and instead is dangling
* on other tasks. Thus, it will eventually need to be added back into the LRU when
* current task using it completes.
*/
goto find_another_data;
}
/* It's also possible that the ref_count of that element is bigger than 1
* In that case, it's because some task completion did not execute yet, and
* we need to keep it in the list until it reaches 1.
*/
if( lru_gpu_elem->super.super.obj_reference_count > 1 ) {
/* It's also possible (although unlikely) that we livelock here:
* if gpu_mem_lru has *only* elements with readers == 0 but
* ref_count > 1, then we might pop/push forever. We save the
* earliest element found and if we see it again it means we
* run over the entire list without finding a suitable replacement.
* We need to make progress on something else. This remains safe for as long as the
* LRU is only modified by a single thread (in this case the current thread).
*/
PARSEC_DEBUG_VERBOSE(20, parsec_gpu_output_stream,
"GPU[%s]:%s: Push back LRU-retrieved GPU copy %p [readers %d, ref_count %d] original %p",
gpu_device->super.name, task_name,
lru_gpu_elem, lru_gpu_elem->readers, lru_gpu_elem->super.super.obj_reference_count, lru_gpu_elem->original);
assert(0 != (lru_gpu_elem->flags & PARSEC_DATA_FLAG_PARSEC_OWNED) );
parsec_list_push_back(&gpu_device->gpu_mem_lru, &lru_gpu_elem->super);