In [1]:
!pip install nvcc4jupyter



In [2]:
%load_ext nvcc4jupyter

Detected platform "Colab". Running its setup...
Source files will be saved in "/tmp/tmptgmpqsa3".


In [11]:
%%cuda -c "--gpu-architecture sm_75 -O2 --default-stream per-thread"
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <cuda_runtime.h>

// Busy-work kernel: every in-range thread reads x[idx], adds 1 to the value
// `len` times in a loop, and writes the result back to x[idx].
// Indexing uses only the .x components of threadIdx/blockIdx/blockDim.
// Threads whose index is >= len do nothing.
// (Kernel_01 is the same kernel under a different name.)
__global__ void Kernel_00(float*x, int len)
{
	const int idx = blockIdx.x * blockDim.x + threadIdx.x;

	// Guard: skip threads that fall past the end of the array.
	if(idx >= len) {
		return;
	}

	float acc = x[idx];
	for(int step = 0; step < len; ++step) {
		acc += 1;
	}
	x[idx] = acc;
}

// Each thread with a global index below `len` loads its element of x,
// increments it by 1 a total of `len` times, and stores it back.
// Only the .x launch dimensions are used for indexing.
__global__ void Kernel_01(float*x, int len)
{
	const int gid = blockDim.x * blockIdx.x + threadIdx.x;
	const bool in_range = gid < len;

	if(in_range) {
		float total = x[gid];

		// Count down from len: performs exactly `len` additions.
		for(int remaining = len; remaining > 0; --remaining) {
			total += 1;
		}

		x[gid] = total;
	}
}


// Print a readable message and terminate if a CUDA runtime call failed.
static void checkCuda(cudaError_t status, const char *what, int line)
{
	if(status != cudaSuccess) {
		fprintf(stderr, "CUDA error at line %d (%s): %s\n", line, what, cudaGetErrorString(status));
		exit(EXIT_FAILURE);
	}
}
#define CUDA_CHECK(call) checkCuda((call), #call, __LINE__)

// Print every element of data[0..count) that differs from `expected`
// (prefixed with `label`), print a pass line if none differ, and return the
// number of mismatches.
static int verifyBuffer(const float *data, int count, float expected, const char *label)
{
	int mismatches = 0;
	for(int i=0; i<count; i++) {
		if(data[i] != expected) {
			printf("%s[%d] = %f\n", label, i, data[i]);
			mismatches += 1;
		}
	}
	if(mismatches == 0) {
		printf("Pass test on %s!\n", label);
	}
	return mismatches;
}

// Stream-priority demo: runs Kernel_00 on a LOW priority stream and Kernel_01
// on a HIGH priority stream, each on its own zero-initialised buffer of N
// floats, then verifies that every element equals N.
// Returns 0 when both buffers verify, non-zero otherwise. Any failing CUDA
// call aborts the program with a diagnostic.
int main()
{
	const int streamsNum = 2;
	const int N = 1<<10; // 1Kibi
	const size_t bytes = sizeof(float) * N;

	const int threads_per_block = 128;
	const int num_blocks = (N + threads_per_block - 1) / threads_per_block; // ceil-div

	std::cout << "Running " << N << " (floats) as the input data size." << std::endl;
	std::cout << "Launching " << streamsNum << " cuda streams." << std::endl;

	// device property
	cudaDeviceProp device_prop;
	int devID = 0; // use 1st device as default
	// Select the device explicitly so the properties and the priority range
	// queried below refer to the same device.
	CUDA_CHECK(cudaSetDevice(devID));
	CUDA_CHECK(cudaGetDeviceProperties(&device_prop, devID));

	// query for priority range
	int priority_l;
	int priority_h;

	CUDA_CHECK(cudaDeviceGetStreamPriorityRange(&priority_l, &priority_h));
	printf("Stream priority range: LOW: %d to HIGH: %d on %s\n", priority_l, priority_h, device_prop.name);

	// pinned (page-locked) host memory, as returned by cudaMallocHost
	float *h_a = nullptr, *h_b = nullptr;
	CUDA_CHECK(cudaMallocHost((void**)&h_a, bytes));
	CUDA_CHECK(cudaMallocHost((void**)&h_b, bytes));

	for(int i=0; i<N; i++) {
		h_a[i] = 0.0f;
		h_b[i] = 0.0f;
	}

	// device
	float *d_a = nullptr, *d_b = nullptr;
	CUDA_CHECK(cudaMalloc((void**)&d_a, bytes));
	CUDA_CHECK(cudaMalloc((void**)&d_b, bytes));

	// streams
	cudaStream_t streams[streamsNum];
	cudaEvent_t  events[streamsNum]; // events for streams

	// stream 0 gets the low priority, stream 1 the high priority.
	// Swap the two entries to invert the assignment.
	const int priorities[streamsNum] = { priority_l, priority_h };

	// Each stream is created exactly once, directly with its priority.
	// Creating it first with cudaStreamCreate and then again with
	// cudaStreamCreateWithPriority would overwrite (and leak) the first handle.
	for(int i=0; i<streamsNum; i++) {
		CUDA_CHECK(cudaStreamCreateWithPriority(&streams[i], cudaStreamNonBlocking, priorities[i]));
		CUDA_CHECK(cudaEventCreate(&events[i]));
	}

	// h2d
	CUDA_CHECK(cudaMemcpyAsync(d_a, h_a, bytes, cudaMemcpyHostToDevice, streams[0]));
	CUDA_CHECK(cudaMemcpyAsync(d_b, h_b, bytes, cudaMemcpyHostToDevice, streams[1]));

	// low priority kernel (stream 0)
	Kernel_00 <<< num_blocks, threads_per_block, 0, streams[0] >>> (d_a, N); // a + x
	CUDA_CHECK(cudaGetLastError()); // catches launch-configuration errors
	CUDA_CHECK(cudaEventRecord(events[0], streams[0]));

	// high priority kernel (stream 1)
	Kernel_01 <<< num_blocks, threads_per_block, 0, streams[1] >>> (d_b, N); // b + x
	CUDA_CHECK(cudaGetLastError());
	CUDA_CHECK(cudaEventRecord(events[1], streams[1]));

	// wait until both kernels have finished
	CUDA_CHECK(cudaEventSynchronize(events[0]));
	CUDA_CHECK(cudaEventSynchronize(events[1]));

	// d2h
	CUDA_CHECK(cudaMemcpyAsync(h_a, d_a, bytes, cudaMemcpyDeviceToHost, streams[0]));
	CUDA_CHECK(cudaMemcpyAsync(h_b, d_b, bytes, cudaMemcpyDeviceToHost, streams[1]));

	// The copies above are asynchronous: wait for them before reading h_a/h_b.
	CUDA_CHECK(cudaDeviceSynchronize());

	// check results: every element started at 0 and was incremented N times
	const float expected = (float)N;
	const int error_a = verifyBuffer(h_a, N, expected, "h_a");
	const int error_b = verifyBuffer(h_b, N, expected, "h_b");

	// free
	for(int i=0; i<streamsNum; i++) {
		CUDA_CHECK(cudaStreamDestroy(streams[i]));
		CUDA_CHECK(cudaEventDestroy(events[i]));
	}

	CUDA_CHECK(cudaFree(d_a));
	CUDA_CHECK(cudaFree(d_b));

	CUDA_CHECK(cudaFreeHost(h_a));
	CUDA_CHECK(cudaFreeHost(h_b));

	// Report verification failures through the exit code as well.
	return (error_a == 0 && error_b == 0) ? EXIT_SUCCESS : EXIT_FAILURE;
}

Running 1024 (floats) as the input data size.
Launching 2 cuda streams.
Stream priority range: LOW: 0 to HIGH: -5 on Tesla T4
Pass test on h_a!
Pass test on h_b!



In [4]:
%%cuda -c "--gpu-architecture sm_75 -O2 --default-stream per-thread"
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <cuda_runtime.h>

// Busy-work kernel used by the stream-priority demo.
// A thread whose global .x index is below `len` takes its element of x,
// adds 1 to it `len` times, and stores the sum back in place.
// (Kernel_01 is the same kernel under a different name.)
__global__ void Kernel_00(float*x, int len)
{
	const unsigned int block_start = blockIdx.x * blockDim.x;
	const int pos = block_start + threadIdx.x;

	// Nothing to do for threads beyond the array bounds.
	if(!(pos < len)) {
		return;
	}

	float value = x[pos];
	int done = 0;
	while(done < len) {
		value += 1;
		++done;
	}
	x[pos] = value;
}

// In-place increment kernel: for each global .x index below `len`, the
// owning thread reads the element, adds 1 to it `len` times, and writes the
// result back to the same location.
__global__ void Kernel_01(float*x, int len)
{
	const int gid = threadIdx.x + blockDim.x * blockIdx.x;

	if(gid < len) {
		float *slot = x + gid; // this thread's element
		float running = *slot;

		for(int pass = 1; pass <= len; ++pass) {
			running += 1;
		}

		*slot = running;
	}
}


// Print a readable message and terminate if a CUDA runtime call failed.
static void checkCuda(cudaError_t status, const char *what, int line)
{
	if(status != cudaSuccess) {
		fprintf(stderr, "CUDA error at line %d (%s): %s\n", line, what, cudaGetErrorString(status));
		exit(EXIT_FAILURE);
	}
}
#define CUDA_CHECK(call) checkCuda((call), #call, __LINE__)

// Print every element of data[0..count) that differs from `expected`
// (prefixed with `label`), print a pass line if none differ, and return the
// number of mismatches.
static int verifyBuffer(const float *data, int count, float expected, const char *label)
{
	int mismatches = 0;
	for(int i=0; i<count; i++) {
		if(data[i] != expected) {
			printf("%s[%d] = %f\n", label, i, data[i]);
			mismatches += 1;
		}
	}
	if(mismatches == 0) {
		printf("Pass test on %s!\n", label);
	}
	return mismatches;
}

// Stream-priority demo with the priorities inverted: runs Kernel_00 on a HIGH
// priority stream and Kernel_01 on a LOW priority stream, each on its own
// zero-initialised buffer of N floats, then verifies that every element
// equals N.
// Returns 0 when both buffers verify, non-zero otherwise. Any failing CUDA
// call aborts the program with a diagnostic.
int main()
{
	const int streamsNum = 2;
	const int N = 1<<10; // 1Kibi
	const size_t bytes = sizeof(float) * N;

	const int threads_per_block = 128;
	const int num_blocks = (N + threads_per_block - 1) / threads_per_block; // ceil-div

	std::cout << "Running " << N << " (floats) as the input data size." << std::endl;
	std::cout << "Launching " << streamsNum << " cuda streams." << std::endl;

	// device property
	cudaDeviceProp device_prop;
	int devID = 0; // use 1st device as default
	// Select the device explicitly so the properties and the priority range
	// queried below refer to the same device.
	CUDA_CHECK(cudaSetDevice(devID));
	CUDA_CHECK(cudaGetDeviceProperties(&device_prop, devID));

	// query for priority range
	int priority_l;
	int priority_h;

	CUDA_CHECK(cudaDeviceGetStreamPriorityRange(&priority_l, &priority_h));
	printf("Stream priority range: LOW: %d to HIGH: %d on %s\n", priority_l, priority_h, device_prop.name);

	// pinned (page-locked) host memory, as returned by cudaMallocHost
	float *h_a = nullptr, *h_b = nullptr;
	CUDA_CHECK(cudaMallocHost((void**)&h_a, bytes));
	CUDA_CHECK(cudaMallocHost((void**)&h_b, bytes));

	for(int i=0; i<N; i++) {
		h_a[i] = 0.0f;
		h_b[i] = 0.0f;
	}

	// device
	float *d_a = nullptr, *d_b = nullptr;
	CUDA_CHECK(cudaMalloc((void**)&d_a, bytes));
	CUDA_CHECK(cudaMalloc((void**)&d_b, bytes));

	// streams
	cudaStream_t streams[streamsNum];
	cudaEvent_t  events[streamsNum]; // events for streams

	// stream 0 gets the HIGH priority, stream 1 the LOW priority.
	// Swap the two entries to invert the assignment.
	const int priorities[streamsNum] = { priority_h, priority_l };

	// Each stream is created exactly once, directly with its priority.
	// Creating it first with cudaStreamCreate and then again with
	// cudaStreamCreateWithPriority would overwrite (and leak) the first handle.
	for(int i=0; i<streamsNum; i++) {
		CUDA_CHECK(cudaStreamCreateWithPriority(&streams[i], cudaStreamNonBlocking, priorities[i]));
		CUDA_CHECK(cudaEventCreate(&events[i]));
	}

	// h2d
	CUDA_CHECK(cudaMemcpyAsync(d_a, h_a, bytes, cudaMemcpyHostToDevice, streams[0]));
	CUDA_CHECK(cudaMemcpyAsync(d_b, h_b, bytes, cudaMemcpyHostToDevice, streams[1]));

	// high priority kernel (stream 0)
	Kernel_00 <<< num_blocks, threads_per_block, 0, streams[0] >>> (d_a, N); // a + x
	CUDA_CHECK(cudaGetLastError()); // catches launch-configuration errors
	CUDA_CHECK(cudaEventRecord(events[0], streams[0]));

	// low priority kernel (stream 1)
	Kernel_01 <<< num_blocks, threads_per_block, 0, streams[1] >>> (d_b, N); // b + x
	CUDA_CHECK(cudaGetLastError());
	CUDA_CHECK(cudaEventRecord(events[1], streams[1]));

	// wait until both kernels have finished
	CUDA_CHECK(cudaEventSynchronize(events[0]));
	CUDA_CHECK(cudaEventSynchronize(events[1]));

	// d2h
	CUDA_CHECK(cudaMemcpyAsync(h_a, d_a, bytes, cudaMemcpyDeviceToHost, streams[0]));
	CUDA_CHECK(cudaMemcpyAsync(h_b, d_b, bytes, cudaMemcpyDeviceToHost, streams[1]));

	// The copies above are asynchronous: wait for them before reading h_a/h_b.
	CUDA_CHECK(cudaDeviceSynchronize());

	// check results: every element started at 0 and was incremented N times
	const float expected = (float)N;
	const int error_a = verifyBuffer(h_a, N, expected, "h_a");
	const int error_b = verifyBuffer(h_b, N, expected, "h_b");

	// free
	for(int i=0; i<streamsNum; i++) {
		CUDA_CHECK(cudaStreamDestroy(streams[i]));
		CUDA_CHECK(cudaEventDestroy(events[i]));
	}

	CUDA_CHECK(cudaFree(d_a));
	CUDA_CHECK(cudaFree(d_b));

	CUDA_CHECK(cudaFreeHost(h_a));
	CUDA_CHECK(cudaFreeHost(h_b));

	// Report verification failures through the exit code as well.
	return (error_a == 0 && error_b == 0) ? EXIT_SUCCESS : EXIT_FAILURE;
}

Running 1024 (floats) as the input data size.
Launching 2 cuda streams.
Stream priority range: LOW: 0 to HIGH: -5 on Tesla T4
Pass test on h_a!
Pass test on h_b!

