In [1]:
!pip install nvcc4jupyter

Collecting nvcc4jupyter
  Downloading nvcc4jupyter-1.2.1-py3-none-any.whl.metadata (5.1 kB)
Downloading nvcc4jupyter-1.2.1-py3-none-any.whl (10 kB)
Installing collected packages: nvcc4jupyter
Successfully installed nvcc4jupyter-1.2.1


In [2]:
%load_ext nvcc4jupyter

Detected platform "Colab". Running its setup...
Source files will be saved in "/tmp/tmpxw_910cv".


In [3]:
%%writefile stream_sync.cu
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <cuda_runtime.h>

// Busy-work kernel: each thread owns one element of x and increments it
// `len` times, one unit per loop iteration.
//
//   x   - array with at least `len` floats, updated in place
//   len - number of valid elements in x; also the per-element iteration count
//
// Uses 1D indexing (threadIdx.x / blockIdx.x / blockDim.x only). Threads whose
// global index falls at or beyond `len` do nothing, so the grid may be larger
// than the data.
__global__ void testKernel(float*x, int len)
{
	const int tid = blockIdx.x * blockDim.x + threadIdx.x;

	// Surplus threads in the last block have no element to work on.
	if (tid >= len) {
		return;
	}

	// Accumulate in a local and write back once at the end.
	float acc = x[tid];
	for (int step = 0; step < len; ++step) {
		acc += 1;
	}
	x[tid] = acc;
}

// Evaluates a CUDA runtime call and terminates the program with a
// file/line-tagged message if it did not return cudaSuccess. Unchecked
// failures would otherwise surface later as unrelated-looking errors or as
// wrong results during verification.
#define CUDA_CHECK(call)                                                      \
	do {                                                                      \
		cudaError_t cuda_check_err_ = (call);                                 \
		if (cuda_check_err_ != cudaSuccess) {                                 \
			fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,     \
			        cudaGetErrorString(cuda_check_err_));                     \
			exit(1);                                                          \
		}                                                                     \
	} while (0)

// Compares every element of data[0..n) against n, printing each mismatch as
// "<label>[i] = value". When nothing mismatches, prints the pass banner and
// element 0. Returns the number of mismatching elements.
static int reportMismatches(const float *data, int n, const char *label)
{
	int errors = 0;
	for (int i = 0; i < n; i++) {
		if (data[i] != n) {
			printf("%s[%d] = %f\n", label, i, data[i]);
			errors += 1;
		}
	}
	if (errors == 0) {
		printf("Pass test on %s!\n", label);
		printf("%s[%d] = %f\n", label, 0, data[0]);
	}
	return errors;
}

// Runs testKernel on two independent buffers, each on its own CUDA stream
// (async H2D copy -> kernel -> async D2H copy), then checks on the host that
// every element equals N.
//
// Usage: ./stream_sync [N]
//   N - number of floats per buffer; must be a positive integer (default 1024).
//
// Returns 0 when both buffers verify, 1 on bad arguments, a CUDA failure, or
// a verification mismatch.
int main(int argc, char **argv)
{
	const int streamsNum = 2;
	int N = 1 << 10; // 1K

	if (argc > 2) {
		fprintf(stderr, "Too many arguments! ./stream_sync N .\n");
		exit(1);
	}

	if (argc == 2) {
		// strtol (unlike atoi) lets us reject junk, non-positive values and
		// values that do not fit in an int. N <= 0 would otherwise yield an
		// empty grid (invalid launch) or a wrapped-around allocation size.
		char *end = NULL;
		const long parsed = strtol(argv[1], &end, 10);
		const bool consumedAll = (end != argv[1]) && (*end == '\0');
		const bool fitsInInt = (static_cast<long>(static_cast<int>(parsed)) == parsed);
		if (!consumedAll || parsed <= 0 || !fitsInInt) {
			fprintf(stderr, "Invalid N '%s': expected a positive integer.\n", argv[1]);
			exit(1);
		}
		N = static_cast<int>(parsed);
	}

	std::cout << "Running " << N << " (floats) as the input data size." << std::endl;
	std::cout << "Launching " << streamsNum << " cuda streams." << std::endl;

	const size_t bytes = sizeof(float) * static_cast<size_t>(N);

	// Host buffers are allocated with cudaMallocHost (page-locked) because
	// they are the source/destination of the cudaMemcpyAsync calls below.
	float *h_a = NULL, *h_b = NULL;
	CUDA_CHECK(cudaMallocHost((void**)&h_a, bytes));
	CUDA_CHECK(cudaMallocHost((void**)&h_b, bytes));

	// init
	for (int i = 0; i < N; i++) {
		h_a[i] = 0;
		h_b[i] = 0;
	}

	// device
	float *d_a = NULL, *d_b = NULL;
	CUDA_CHECK(cudaMalloc((void**)&d_a, bytes));
	CUDA_CHECK(cudaMalloc((void**)&d_b, bytes));

	// streams
	cudaStream_t streams[streamsNum];
	for (int i = 0; i < streamsNum; i++) {
		CUDA_CHECK(cudaStreamCreate(&streams[i]));
	}

	// h2d
	CUDA_CHECK(cudaMemcpyAsync(d_a, h_a, bytes, cudaMemcpyHostToDevice, streams[0]));
	CUDA_CHECK(cudaMemcpyAsync(d_b, h_b, bytes, cudaMemcpyHostToDevice, streams[1]));

	// kernel
	const int thread_per_block = 128;
	// Ceil-div written as (N - 1) / t + 1 so it cannot overflow int for large N
	// (N > 0 is guaranteed by the validation above).
	const int blocks_per_grid = (N - 1) / thread_per_block + 1;

	// Launches do not return a status; cudaGetLastError() picks up
	// launch-configuration errors immediately after each one.
	testKernel <<< blocks_per_grid, thread_per_block, 0, streams[0] >>> (d_a, N);
	CUDA_CHECK(cudaGetLastError());
	testKernel <<< blocks_per_grid, thread_per_block, 0, streams[1] >>> (d_b, N);
	CUDA_CHECK(cudaGetLastError());

	// d2h
	CUDA_CHECK(cudaMemcpyAsync(h_a, d_a, bytes, cudaMemcpyDeviceToHost, streams[0]));
	CUDA_CHECK(cudaMemcpyAsync(h_b, d_b, bytes, cudaMemcpyDeviceToHost, streams[1]));

	// Wait for both streams before touching h_a / h_b on the host; this is
	// also where asynchronous execution errors are reported.
	CUDA_CHECK(cudaDeviceSynchronize());

	const int error_a = reportMismatches(h_a, N, "h_a");
	const int error_b = reportMismatches(h_b, N, "h_b");

	// free
	for (int i = 0; i < streamsNum; i++) {
		CUDA_CHECK(cudaStreamDestroy(streams[i]));
	}

	CUDA_CHECK(cudaFree(d_a));
	CUDA_CHECK(cudaFree(d_b));

	CUDA_CHECK(cudaFreeHost(h_a));
	CUDA_CHECK(cudaFreeHost(h_b));

	// Propagate verification failure to the caller via the exit status.
	return (error_a == 0 && error_b == 0) ? 0 : 1;
}


Writing stream_sync.cu


In [6]:
!nvcc -O2 --default-stream per-thread -arch=sm_75 stream_sync.cu -o stream_sync

In [7]:
!./stream_sync 16

Running 16 (floats) as the input data size.
Launching 2 cuda streams.
Pass test on h_a!
h_a[0] = 16.000000
Pass test on h_b!
h_b[0] = 16.000000
