/
benchmark.cu
82 lines (63 loc) · 2.47 KB
/
benchmark.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
#include <vector>
#include <cassert>
#include <cstdlib>
#include <cstdio>
// Benchmarks a single permutation kernel accross multiple runs.
// Returns the average running time in milliseconds.
float benchmark(void* kernel, dim3 block_count, dim3 block_size, int input_size, int runs = 1)
{
assert(runs > 0);
// Generate the input array.
std::vector<int> input_h;
for (int i = 0; i < input_size; i++) {
input_h.push_back(rand());
}
// Copy the input to the GPU.
int *input_d, *output_d;
cudaMalloc(&input_d, input_size * sizeof(int));
cudaMalloc(&output_d, input_size * sizeof(int));
cudaMemcpy(input_d, input_h.data(), input_size * sizeof(int), cudaMemcpyHostToDevice);
// Get the time measurements.
std::vector<float> times;
for (int i = 0; i < runs; i++) {
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
std::vector<void*> args = { (void*)&input_d, (void*)&output_d };
cudaEventRecord(start);
cudaLaunchKernel(kernel, block_count, block_size, args.data());
cudaEventRecord(stop);
cudaEventSynchronize(stop);
float milliseconds = 0;
cudaEventElapsedTime(&milliseconds, start, stop);
times.push_back(milliseconds);
}
// No need to copy the output back to the CPU, simply free the GPU memory.
cudaFree(input_d);
cudaFree(output_d);
// Compute the average time per run.
float total = 0;
for (float t : times) total += t;
return total / (float)runs;
}
// The copy kernel we use to get the maximum effective bandwidth.
// You should launch exactly as many threads as there are array elements.
__global__ void copy_kernel(const int* input, int* output)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
output[i] = input[i];
}
// If you wish to benchmark any of the generated permutation kernels,
// paste the generated CUDA code here.
// We give as an example the code to benchmark the copy kernel as well commented out
// code to benchmark a BPC kernel (you have to fill in the correct dimensions).
int main()
{
int input_size = 1 << 25;
int runs = 100;
float copy_time = benchmark((void*)copy_kernel, { (unsigned int)input_size / 1024, 1, 1 }, { 1024, 1, 1 }, input_size, runs);
//float bpc_time = benchmark((void*)my_bpc_kernel, { ???, 1, 1 }, { ???, ???, 1 }, input_size, runs);
printf("Copy time (input size=%d) : %.3fms\n", input_size, copy_time);
//printf("BPC time (input size=%d) : %.3fms\n", input_size, bpc_time);
return 0;
}