In [9]:
using NVTX
using CUDA

# Make the GPU with ordinal 5 the active device (that device must exist on this host).
device!(5)

CuDevice(5): Tesla V100-SXM2-32GB

1. Kernel functions on the GPU:

In [10]:
"""
    kernelA(a)

GPU kernel that doubles the elements of `a` in place, one element per thread.

The element handled by a thread is its global index across the launch grid, so
the kernel also gives correct results when launched with more than one block.
Threads whose index lies past `length(a)` do nothing. Always returns `nothing`.
"""
function kernelA(a)
    # global 1-based index; reduces to threadIdx().x for a single-block launch
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    # guard against launches with more threads than elements
    if i <= length(a)
        @inbounds a[i] *= 2
    end
    return nothing
end

"""
    kernelB(b)

GPU kernel that adds 3 to the elements of `b` in place, one element per thread.

The element handled by a thread is its global index across the launch grid, so
the kernel also gives correct results when launched with more than one block.
Threads whose index lies past `length(b)` do nothing. Always returns `nothing`.
"""
function kernelB(b)
    # global 1-based index; reduces to threadIdx().x for a single-block launch
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    # guard against launches with more threads than elements
    if i <= length(b)
        @inbounds b[i] += 3
    end
    return nothing
end
nothing

2. Main function:

In [11]:
"""
    test1(N, a, b, stream1, stream2)

Launch `kernelA` on `a` in `stream1` and `kernelB` on `b` in `stream2`, each with
`N` threads, wait for both streams to finish, and return host copies of the two
arrays as the tuple `(a_host, b_host)`.

Each launch is wrapped in its own NVTX range ("stream_1_kernel" and
"stream_2_kernel") so it can be located in a profiler timeline. `a` and `b` are
modified in place by the kernels.
"""
function test1(N, a, b, stream1, stream2)
    # issue both launches from separate tasks and wait until both are submitted
    @sync begin
        @async begin
            NVTX.@range "stream_1_kernel" @cuda threads=N stream=stream1 kernelA(a)
        end
        @async begin
            NVTX.@range "stream_2_kernel" @cuda threads=N stream=stream2 kernelB(b)
        end
    end
    # Wait on each stream explicitly. `CUDA.@sync(stream1)` would merely evaluate
    # `stream1` and then synchronize the current task's stream, not `stream1`.
    CUDA.synchronize(stream1)
    CUDA.synchronize(stream2)
    # copy the results back to host
    a_host = Array(a)
    b_host = Array(b)
    return a_host, b_host
end
nothing

3. Prepare the data

In [12]:
# problem size: number of elements per vector (and threads per launch)
N = 256
# two device vectors of N Float32 ones
a = CUDA.ones(Float32, N)
b = CUDA.ones(Float32, N)

# one CUDA stream per kernel
stream1, stream2 = CuStream(), CuStream()
nothing

4. Run with Nsight Systems

In [13]:
# Run the two-stream version under the profiler. `external=true` is CUDA.jl's mode
# for an externally attached profiler (Nsight Systems in this notebook).
CUDA.@profile external=true a_host, b_host = test1(N, a, b, stream1, stream2)

(Float32[2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0  …  2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0], Float32[4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0  …  4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0])

![image.png](./assets/p01.png)

5. Serial version:

In [None]:
"""
    test2(N, a, b)

Serial counterpart of the two-stream test: launch `kernelA` on `a` and then
`kernelB` on `b`, each with `N` threads and no explicit stream argument, and
return host copies of both arrays as a tuple.

The launches are wrapped in the NVTX ranges "kernel_1" and "kernel_2".
"""
function test2(N, a, b)
    NVTX.@range "kernel_1" begin
        @cuda threads=N kernelA(a)
    end
    NVTX.@range "kernel_2" begin
        @cuda threads=N kernelB(b)
    end
    # bring both results back to the host, `a` first
    return Array(a), Array(b)
end
# Profile the serial version the same way. `a` and `b` are reused as-is, so they
# carry any in-place changes made by earlier kernel runs.
CUDA.@profile external=true a_host, b_host = test2(N, a, b)
nothing

![image.png](./assets/p02.png)