In [1]:
!pip install nvcc4jupyter

Collecting nvcc4jupyter
  Downloading nvcc4jupyter-1.2.1-py3-none-any.whl.metadata (5.1 kB)
Downloading nvcc4jupyter-1.2.1-py3-none-any.whl (10 kB)
Installing collected packages: nvcc4jupyter
Successfully installed nvcc4jupyter-1.2.1


In [2]:
%load_ext nvcc4jupyter

Detected platform "Kaggle". Running its setup...
Updating the package lists...
Installing nvidia-cuda-toolkit, this may take a few minutes...
Source files will be saved in "/tmp/tmpmaswe_ma".


Reference code (the next cell is shown unexecuted; the same code is run in the cell after it):

In [None]:
%%cuda -c "--gpu-architecture sm_75 -O2 --default-stream per-thread"
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

#define NUM_STREAMS 4  // streams per GPU

// Per-GPU bookkeeping: one entry of this struct is kept for every CUDA device.
typedef struct {
	int len;                            // element count (presumably of each buffer below)
	float *h_input;                     // host-side source buffer (allocated with cudaMallocHost in main)
	float *h_output;                    // host-side destination buffer (allocated with cudaMallocHost in main)
	float *d_input;                     // device-side source buffer (allocated with cudaMalloc in main)
	float *d_output;                    // device-side destination buffer (allocated with cudaMalloc in main)
	cudaStream_t streams[NUM_STREAMS];  // NUM_STREAMS streams owned by this GPU
} MGPUdata;

// Element-wise kernel: y[i] = x[i] + len for every i in [0, len).
//   x   - input array with at least `len` elements
//   y   - output array with at least `len` elements
//   len - number of elements to process; also the value added to each element
// Uses a 1D launch with one thread per element; threads whose index is past
// `len` do nothing, so the grid may be rounded up to a whole number of blocks.
__global__ void testKernel(float *x, float *y, int len) {
	const int idx = blockIdx.x * blockDim.x + threadIdx.x;
	if (idx >= len) {
		return;  // surplus thread in the last block
	}
	y[idx] = x[idx] + len;  // constant work per thread
}

// Abort with a diagnostic on stderr when a CUDA runtime call does not return
// cudaSuccess. Kernel launches return nothing, so follow each launch with
// CUDA_CHECK(cudaGetLastError()).
#ifndef CUDA_CHECK
#define CUDA_CHECK(call)                                                \
	do {                                                                \
		cudaError_t cuda_check_err_ = (call);                           \
		if (cuda_check_err_ != cudaSuccess) {                           \
			fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__,      \
			        __LINE__, cudaGetErrorString(cuda_check_err_));     \
			exit(EXIT_FAILURE);                                         \
		}                                                               \
	} while (0)
#endif

// Multi-GPU / multi-stream demo.
//
// For every visible CUDA device the program allocates N floats of device and
// pinned host memory for input and output, splits the N elements across
// NUM_STREAMS streams, and in each stream enqueues: host-to-device copy ->
// testKernel -> device-to-host copy. After synchronizing it checks the results
// on the host and releases everything.
//
// Returns 0 on success, 1 if no device is found, host allocation fails, or the
// results do not match. Any failing CUDA call terminates via CUDA_CHECK.
int main(int argc, char **argv) {
	(void)argc;
	(void)argv;

	int GpuNum = 0;
	if (cudaGetDeviceCount(&GpuNum) != cudaSuccess || GpuNum <= 0) {
		printf("No CUDA devices found!\n");
		return 1;
	}
	printf("CUDA devices = %i\n", GpuNum);

	const int N = 100000;
	const int threads_per_block = 128;
	const size_t bytes = sizeof(float) * (size_t)N;

	// Heap allocation instead of a variable-length array (not standard C++);
	// calloc zero-fills, so every pointer starts out as NULL.
	MGPUdata *mgpu = (MGPUdata *)calloc((size_t)GpuNum, sizeof(MGPUdata));
	if (mgpu == NULL) {
		fprintf(stderr, "Host allocation of per-GPU state failed\n");
		return 1;
	}

	for (int i = 0; i < GpuNum; i++) {
		CUDA_CHECK(cudaSetDevice(i));
		mgpu[i].len = N;

		// Allocate memory
		CUDA_CHECK(cudaMalloc((void**)&mgpu[i].d_input, bytes));
		CUDA_CHECK(cudaMalloc((void**)&mgpu[i].d_output, bytes));
		CUDA_CHECK(cudaMallocHost((void**)&mgpu[i].h_input, bytes));
		CUDA_CHECK(cudaMallocHost((void**)&mgpu[i].h_output, bytes));

		// h_input is host memory, so it is initialized with a plain host loop
		// rather than cudaMemset, whose target is documented as device memory.
		for (int k = 0; k < N; k++) {
			mgpu[i].h_input[k] = 0.0f;
		}

		// Create multiple streams
		for (int s = 0; s < NUM_STREAMS; s++) {
			CUDA_CHECK(cudaStreamCreate(&mgpu[i].streams[s]));
		}
	}

	// Every stream handles chunk_size elements; the last stream additionally
	// takes the remainder so no element is dropped when N is not a multiple
	// of NUM_STREAMS. Note that testKernel adds its `len` argument, so a
	// ragged last chunk gets a different addend than the others.
	const int chunk_size = N / NUM_STREAMS;

	for (int i = 0; i < GpuNum; i++) {
		CUDA_CHECK(cudaSetDevice(i));

		for (int s = 0; s < NUM_STREAMS; s++) {
			const int offset = s * chunk_size;
			const int count = (s == NUM_STREAMS - 1) ? (N - offset) : chunk_size;
			if (count <= 0) {
				continue;  // a zero-block launch would be an invalid configuration
			}
			const size_t chunk_bytes = sizeof(float) * (size_t)count;

			CUDA_CHECK(cudaMemcpyAsync(mgpu[i].d_input + offset, mgpu[i].h_input + offset,
			                           chunk_bytes, cudaMemcpyHostToDevice, mgpu[i].streams[s]));

			const int stream_blocks = (count + threads_per_block - 1) / threads_per_block;
			testKernel<<<stream_blocks, threads_per_block, 0, mgpu[i].streams[s]>>>(
				mgpu[i].d_input + offset, mgpu[i].d_output + offset, count);
			CUDA_CHECK(cudaGetLastError());  // surfaces launch-configuration errors

			CUDA_CHECK(cudaMemcpyAsync(mgpu[i].h_output + offset, mgpu[i].d_output + offset,
			                           chunk_bytes, cudaMemcpyDeviceToHost, mgpu[i].streams[s]));
		}
	}

	int status = 0;

	// Synchronize, verify and cleanup
	for (int i = 0; i < GpuNum; i++) {
		CUDA_CHECK(cudaSetDevice(i));

		// Sync all streams; asynchronous execution errors surface here.
		for (int s = 0; s < NUM_STREAMS; s++) {
			CUDA_CHECK(cudaStreamSynchronize(mgpu[i].streams[s]));
		}

		// Host-side check: each output must equal its input plus the chunk length.
		long mismatches = 0;
		for (int s = 0; s < NUM_STREAMS; s++) {
			const int offset = s * chunk_size;
			const int count = (s == NUM_STREAMS - 1) ? (N - offset) : chunk_size;
			for (int k = 0; k < count; k++) {
				const float expected = mgpu[i].h_input[offset + k] + (float)count;
				if (mgpu[i].h_output[offset + k] != expected) {
					mismatches++;
				}
			}
		}
		if (mismatches != 0) {
			fprintf(stderr, "GPU %d: %ld mismatching results\n", i, mismatches);
			status = 1;
		}

		for (int s = 0; s < NUM_STREAMS; s++) {
			CUDA_CHECK(cudaStreamDestroy(mgpu[i].streams[s]));
		}

		CUDA_CHECK(cudaFreeHost(mgpu[i].h_input));
		CUDA_CHECK(cudaFreeHost(mgpu[i].h_output));
		CUDA_CHECK(cudaFree(mgpu[i].d_input));
		CUDA_CHECK(cudaFree(mgpu[i].d_output));
	}

	free(mgpu);
	return status;
}

In [8]:
%%cuda -c "--gpu-architecture sm_75 -O2 --default-stream per-thread"
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

#define NUM_STREAMS 4  // streams per GPU

// Per-GPU bookkeeping: one entry of this struct is kept for every CUDA device.
typedef struct {
	int len;                            // element count (presumably of each buffer below)
	float *h_input;                     // host-side source buffer (allocated with cudaMallocHost in main)
	float *h_output;                    // host-side destination buffer (allocated with cudaMallocHost in main)
	float *d_input;                     // device-side source buffer (allocated with cudaMalloc in main)
	float *d_output;                    // device-side destination buffer (allocated with cudaMalloc in main)
	cudaStream_t streams[NUM_STREAMS];  // NUM_STREAMS streams owned by this GPU
} MGPUdata;

// Element-wise kernel: y[i] = x[i] + len for every i in [0, len).
//   x   - input array with at least `len` elements
//   y   - output array with at least `len` elements
//   len - number of elements to process; also the value added to each element
// Uses a 1D launch with one thread per element; threads whose index is past
// `len` do nothing, so the grid may be rounded up to a whole number of blocks.
__global__ void testKernel(float *x, float *y, int len) {
	const int idx = blockIdx.x * blockDim.x + threadIdx.x;
	if (idx >= len) {
		return;  // surplus thread in the last block
	}
	y[idx] = x[idx] + len;  // constant work per thread
}

// Abort with a diagnostic on stderr when a CUDA runtime call does not return
// cudaSuccess. Kernel launches return nothing, so follow each launch with
// CUDA_CHECK(cudaGetLastError()).
#ifndef CUDA_CHECK
#define CUDA_CHECK(call)                                                \
	do {                                                                \
		cudaError_t cuda_check_err_ = (call);                           \
		if (cuda_check_err_ != cudaSuccess) {                           \
			fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__,      \
			        __LINE__, cudaGetErrorString(cuda_check_err_));     \
			exit(EXIT_FAILURE);                                         \
		}                                                               \
	} while (0)
#endif

// Multi-GPU / multi-stream demo.
//
// For every visible CUDA device the program allocates N floats of device and
// pinned host memory for input and output, splits the N elements across
// NUM_STREAMS streams, and in each stream enqueues: host-to-device copy ->
// testKernel -> device-to-host copy. After synchronizing it checks the results
// on the host and releases everything.
//
// Returns 0 on success, 1 if no device is found, host allocation fails, or the
// results do not match. Any failing CUDA call terminates via CUDA_CHECK.
int main(int argc, char **argv) {
	(void)argc;
	(void)argv;

	int GpuNum = 0;
	if (cudaGetDeviceCount(&GpuNum) != cudaSuccess || GpuNum <= 0) {
		printf("No CUDA devices found!\n");
		return 1;
	}
	printf("CUDA devices = %i\n", GpuNum);

	const int N = 100000;
	const int threads_per_block = 128;
	const size_t bytes = sizeof(float) * (size_t)N;

	// Heap allocation instead of a variable-length array (not standard C++);
	// calloc zero-fills, so every pointer starts out as NULL.
	MGPUdata *mgpu = (MGPUdata *)calloc((size_t)GpuNum, sizeof(MGPUdata));
	if (mgpu == NULL) {
		fprintf(stderr, "Host allocation of per-GPU state failed\n");
		return 1;
	}

	for (int i = 0; i < GpuNum; i++) {
		CUDA_CHECK(cudaSetDevice(i));
		mgpu[i].len = N;

		// Allocate memory
		CUDA_CHECK(cudaMalloc((void**)&mgpu[i].d_input, bytes));
		CUDA_CHECK(cudaMalloc((void**)&mgpu[i].d_output, bytes));
		CUDA_CHECK(cudaMallocHost((void**)&mgpu[i].h_input, bytes));
		CUDA_CHECK(cudaMallocHost((void**)&mgpu[i].h_output, bytes));

		// h_input is host memory, so it is initialized with a plain host loop
		// rather than cudaMemset, whose target is documented as device memory.
		for (int k = 0; k < N; k++) {
			mgpu[i].h_input[k] = 0.0f;
		}

		// Create multiple streams
		for (int s = 0; s < NUM_STREAMS; s++) {
			CUDA_CHECK(cudaStreamCreate(&mgpu[i].streams[s]));
		}
	}

	// Every stream handles chunk_size elements; the last stream additionally
	// takes the remainder so no element is dropped when N is not a multiple
	// of NUM_STREAMS. Note that testKernel adds its `len` argument, so a
	// ragged last chunk gets a different addend than the others.
	const int chunk_size = N / NUM_STREAMS;

	for (int i = 0; i < GpuNum; i++) {
		CUDA_CHECK(cudaSetDevice(i));

		for (int s = 0; s < NUM_STREAMS; s++) {
			const int offset = s * chunk_size;
			const int count = (s == NUM_STREAMS - 1) ? (N - offset) : chunk_size;
			if (count <= 0) {
				continue;  // a zero-block launch would be an invalid configuration
			}
			const size_t chunk_bytes = sizeof(float) * (size_t)count;

			CUDA_CHECK(cudaMemcpyAsync(mgpu[i].d_input + offset, mgpu[i].h_input + offset,
			                           chunk_bytes, cudaMemcpyHostToDevice, mgpu[i].streams[s]));

			const int stream_blocks = (count + threads_per_block - 1) / threads_per_block;
			testKernel<<<stream_blocks, threads_per_block, 0, mgpu[i].streams[s]>>>(
				mgpu[i].d_input + offset, mgpu[i].d_output + offset, count);
			CUDA_CHECK(cudaGetLastError());  // surfaces launch-configuration errors

			CUDA_CHECK(cudaMemcpyAsync(mgpu[i].h_output + offset, mgpu[i].d_output + offset,
			                           chunk_bytes, cudaMemcpyDeviceToHost, mgpu[i].streams[s]));
		}
	}

	int status = 0;

	// Synchronize, verify and cleanup
	for (int i = 0; i < GpuNum; i++) {
		CUDA_CHECK(cudaSetDevice(i));

		// Sync all streams; asynchronous execution errors surface here.
		for (int s = 0; s < NUM_STREAMS; s++) {
			CUDA_CHECK(cudaStreamSynchronize(mgpu[i].streams[s]));
		}

		// Host-side check: each output must equal its input plus the chunk length.
		long mismatches = 0;
		for (int s = 0; s < NUM_STREAMS; s++) {
			const int offset = s * chunk_size;
			const int count = (s == NUM_STREAMS - 1) ? (N - offset) : chunk_size;
			for (int k = 0; k < count; k++) {
				const float expected = mgpu[i].h_input[offset + k] + (float)count;
				if (mgpu[i].h_output[offset + k] != expected) {
					mismatches++;
				}
			}
		}
		if (mismatches != 0) {
			fprintf(stderr, "GPU %d: %ld mismatching results\n", i, mismatches);
			status = 1;
		}

		for (int s = 0; s < NUM_STREAMS; s++) {
			CUDA_CHECK(cudaStreamDestroy(mgpu[i].streams[s]));
		}

		CUDA_CHECK(cudaFreeHost(mgpu[i].h_input));
		CUDA_CHECK(cudaFreeHost(mgpu[i].h_output));
		CUDA_CHECK(cudaFree(mgpu[i].d_input));
		CUDA_CHECK(cudaFree(mgpu[i].d_output));
	}

	free(mgpu);
	return status;
}

CUDA devices = 2

