# performance.jl
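#
# Compares a simple element-wise computation in pure Julia against the same
# computation run as an OpenCL kernel on every available device, printing
# device info and timings for each.
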
using OpenCL, Printf, LinearAlgebra
const bench_kernel = "
__kernel void sum(__global const float *a,
                  __global const float *b,
                  __global float *c)
{
    int gid = get_global_id(0);
    float a_temp;
    float b_temp;
    float c_temp;
    a_temp = a[gid];                 // my a element (by global ref)
    b_temp = b[gid];                 // my b element (by global ref)
    c_temp = a_temp + b_temp;        // sum of my elements
    c_temp = c_temp * c_temp;        // square of the sum
    c_temp = c_temp * (a_temp/2.0f); // times 1/2 my a
    c[gid] = c_temp;                 // store result in global memory
}"
function cl_performance(ndatapts::Integer, nworkers::Integer)
    @assert ndatapts > 0
    @assert nworkers > 0

    a = rand(Float32, ndatapts)
    b = rand(Float32, ndatapts)
    c = Vector{Float32}(undef, ndatapts)

    @printf("Size of test data: %i MB\n", sizeof(a) / 1024 / 1024)

    t1 = time()
    for i in 1:ndatapts
        c_temp = a[i] + b[i]
        c_temp = c_temp * c_temp
        c[i] = c_temp * (a[i] / 2f0)
    end
    t2 = time()

    @printf("Julia Execution time: %.4f seconds\n", t2 - t1)
    for platform in cl.platforms()

        if platform[:name] == "Portable Computing Language"
            @warn("Portable Computing Language platform not yet supported")
            continue
        end

        for device in cl.available_devices(platform)
            @printf("====================================================\n")
            @printf("Platform name: %s\n", platform[:name])
            @printf("Platform profile: %s\n", platform[:profile])
            @printf("Platform vendor: %s\n", platform[:vendor])
            @printf("Platform version: %s\n", platform[:version])
            @printf("----------------------------------------------------\n")
            @printf("Device name: %s\n", device[:name])
            @printf("Device type: %s\n", device[:device_type])
            @printf("Device mem: %i MB\n", device[:global_mem_size] / 1024^2)
            @printf("Device max mem alloc: %i MB\n", device[:max_mem_alloc_size] / 1024^2)
            @printf("Device max clock freq: %i MHZ\n", device[:max_clock_frequency])
            @printf("Device max compute units: %i\n", device[:max_compute_units])
            @printf("Device max work group size: %i\n", device[:max_work_group_size])
            @printf("Device max work item size: %s\n", device[:max_work_item_size])

            if device[:max_mem_alloc_size] < sizeof(Float32) * ndatapts
                @warn("Requested buffer size exceeds device max alloc size!")
                @warn("Skipping device $(device[:name])...")
                continue
            end

            if device[:max_work_group_size] < nworkers
                @warn("Number of workers exceeds the device's max work group size!")
                @warn("Skipping device $(device[:name])...")
                continue
            end
            ctx = cl.Context(device)
            queue = cl.CmdQueue(ctx, :profile)
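            # :profile enables command-queue profiling (CL_QUEUE_PROFILING_ENABLE),
            # required to read evt[:profile_duration] below.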
            a_buf = cl.Buffer(Float32, ctx, (:r, :copy), hostbuf=a)
            b_buf = cl.Buffer(Float32, ctx, (:r, :copy), hostbuf=b)
            c_buf = cl.Buffer(Float32, ctx, :w, length(a))
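            # (:r, :copy) makes read-only device buffers initialized by copying
            # the host arrays; :w makes a write-only result buffer of length(a)
            # elements.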
            prg = cl.Program(ctx, source=bench_kernel) |> cl.build!
            kern = cl.Kernel(prg, "sum")
            # work_group_multiple = kern[:prefered_work_group_size_multiple]
            global_size = (ndatapts,)
            local_size = (nworkers,)
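            # OpenCL requires the global size to be a multiple of the local
            # size; with the defaults below that is 2^23 / 2^7 = 2^16 work groups.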
            # call the kernel
            evt = kern[queue, global_size, local_size](a_buf, b_buf, c_buf)
            # evt[:profile_duration] is reported in nanoseconds; convert to seconds
            t = evt[:profile_duration] * 1e-9
            @printf("Execution time of test: %.4f seconds\n", t)

            c_device = cl.read(queue, c_buf)
            @info("Result norm: $(norm(c - c_device))")
        end
    end
end
# Play with these numbers to see performance differences.
# N_DATA_PTS has to be a multiple of the number of workers.
# N_WORKERS has to be less than or equal to the device's max work group size.
# e.g. N_WORKERS = 1 is non-parallel execution on the GPU.
const N_DATA_PTS = Int(2^23) # ~8 million
const N_WORKERS = Int(2^7)

cl_performance(N_DATA_PTS, N_WORKERS)