In [2]:
!pip install nvcc4jupyter
%load_ext nvcc4jupyter

Collecting nvcc4jupyter
  Downloading nvcc4jupyter-1.2.1-py3-none-any.whl.metadata (5.1 kB)
Downloading nvcc4jupyter-1.2.1-py3-none-any.whl (10 kB)
Installing collected packages: nvcc4jupyter
Successfully installed nvcc4jupyter-1.2.1
Detected platform "Colab". Running its setup...
Source files will be saved in "/tmp/tmpn87h0etl".


In [17]:
%%cuda
#include<assert.h>
#include<stdio.h>
#include<stdlib.h>
#include<time.h>
#include<cuda.h>

// Host reference: stores the element-wise product a[i]*b[i] into c[i] for i in [0, n).
void prodottoCPU(float *a, float *b, float *c, int n);
// Device kernel: each block multiplies its slice of a and b element-wise, sums the
// products in dynamic shared memory, and writes that partial sum to u[blockIdx.x].
__global__ void prodottoGPU(float* a, float* b, float* c, float* u, int n);

// Aborts with a diagnostic when a CUDA runtime call does not return cudaSuccess.
// Without this, an early failure (e.g. a failed cudaMalloc) silently poisons
// every later call and the program prints meaningless results.
#define CUDA_CHECK(call)                                                      \
	do {                                                                      \
		cudaError_t err_ = (call);                                            \
		if (err_ != cudaSuccess) {                                            \
			fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,     \
			        cudaGetErrorString(err_));                                \
			exit(EXIT_FAILURE);                                               \
		}                                                                     \
	} while (0)

// Computes the dot product of two random vectors twice: on the GPU (per-block
// partial sums produced by prodottoGPU, then added up on the host) and on the
// CPU (prodottoCPU followed by a serial sum). Prints both results and both
// elapsed times (milliseconds, as reported by cudaEventElapsedTime) and
// asserts that the two sums match.
int main(void) {
	float *a_h, *b_h, *c_h; // host data
	float *a_d, *b_d, *c_d; // device data
	float *u_d, *u_h;       // per-block partial sums (device / host copy)
	int N, i;
	size_t nBytes, nBlockBytes, s_size;
	float elapsed_gpu, elapsed_cpu;
	cudaEvent_t start_gpu, stop_gpu, start_cpu, stop_cpu;

	N = 320000;

	// Launch configuration. The locals are deliberately NOT called
	// gridDim/blockDim so they do not shadow the device built-ins.
	dim3 block(32);
	// Exact number of blocks needed to cover N elements (ceiling division).
	dim3 grid((N + block.x - 1) / block.x);

	nBytes = (size_t)N * sizeof(float);
	nBlockBytes = (size_t)grid.x * sizeof(float);

	a_h = (float*)malloc(nBytes);
	b_h = (float*)malloc(nBytes);
	c_h = (float*)malloc(nBytes);
	u_h = (float*)malloc(nBlockBytes);
	if (a_h == NULL || b_h == NULL || c_h == NULL || u_h == NULL) {
		fprintf(stderr, "host allocation failed\n");
		return EXIT_FAILURE;
	}

	CUDA_CHECK(cudaMalloc((void**)&a_d, nBytes));
	CUDA_CHECK(cudaMalloc((void**)&b_d, nBytes));
	CUDA_CHECK(cudaMalloc((void**)&c_d, nBytes));
	CUDA_CHECK(cudaMalloc((void**)&u_d, nBlockBytes));

	// Initialise the input vectors with small random integers in [-2, 2];
	// the generator is seeded from the current system time.
	srand((unsigned int) time(0));

	for (i = 0; i < N; i++) {
		a_h[i] = (float)(rand() % 5 - 2);
		b_h[i] = (float)(rand() % 5 - 2);
	}

	CUDA_CHECK(cudaMemcpy(a_d, a_h, nBytes, cudaMemcpyHostToDevice));
	CUDA_CHECK(cudaMemcpy(b_d, b_h, nBytes, cudaMemcpyHostToDevice));

	// Zero the device vector c (passed to the kernel as its third argument).
	CUDA_CHECK(cudaMemset(c_d, 0, nBytes));

	// Dynamic shared memory: one float per thread of the block.
	s_size = block.x * sizeof(float);

	// GPU path: kernel launch, copy of the partial sums, final host-side sum.
	// The timed interval covers all three steps.
	CUDA_CHECK(cudaEventCreate(&start_gpu));
	CUDA_CHECK(cudaEventCreate(&stop_gpu));
	CUDA_CHECK(cudaEventRecord(start_gpu));
	prodottoGPU<<<grid, block, s_size>>>(a_d, b_d, c_d, u_d, N);
	// A kernel launch returns nothing; configuration errors only show up here.
	CUDA_CHECK(cudaGetLastError());
	// Blocking copy: also waits for the kernel and reports execution errors.
	CUDA_CHECK(cudaMemcpy(u_h, u_d, nBlockBytes, cudaMemcpyDeviceToHost));
	float sommaGPU = 0;
	for (unsigned int blk = 0; blk < grid.x; blk++) {
		sommaGPU += u_h[blk];
	}
	CUDA_CHECK(cudaEventRecord(stop_gpu));
	CUDA_CHECK(cudaEventSynchronize(stop_gpu));
	CUDA_CHECK(cudaEventElapsedTime(&elapsed_gpu, start_gpu, stop_gpu));
	CUDA_CHECK(cudaEventDestroy(start_gpu));
	CUDA_CHECK(cudaEventDestroy(stop_gpu));

	// CPU path: serial element-wise product followed by a serial sum,
	// bracketed by the same kind of events used for the GPU path.
	CUDA_CHECK(cudaEventCreate(&start_cpu));
	CUDA_CHECK(cudaEventCreate(&stop_cpu));
	CUDA_CHECK(cudaEventRecord(start_cpu));
	prodottoCPU(a_h, b_h, c_h, N);
	float sommaCPU = 0;
	for (i = 0; i < N; i++) {
		sommaCPU += c_h[i];
	}
	CUDA_CHECK(cudaEventRecord(stop_cpu));
	CUDA_CHECK(cudaEventSynchronize(stop_cpu));

	CUDA_CHECK(cudaEventElapsedTime(&elapsed_cpu, start_cpu, stop_cpu));
	CUDA_CHECK(cudaEventDestroy(start_cpu));
	CUDA_CHECK(cudaEventDestroy(stop_cpu));

	// Dump the data only for tiny problem sizes.
	if (N < 20) {
		for (i = 0; i < N; i++)
			printf("a_h[%d]=%6.2f ", i, a_h[i]);
		printf("\n");
		for (i = 0; i < N; i++)
			printf("b_h[%d]=%6.2f ", i, b_h[i]);
		printf("\n");
		for (i = 0; i < (int)grid.x; i++)
			printf("u_h[%d]=%6.2f ", i, u_h[i]);
		printf("\n");
	}

	printf("s_GPU = %6.2f\n", sommaGPU);
	printf("s_CPU = %6.2f\n", sommaCPU);
	printf("time_GPU = %f\n", elapsed_gpu);
	printf("time_CPU = %f\n", elapsed_cpu);

	// Exact comparison is valid here: every product is an integer in [-4, 4],
	// so with N = 320000 all partial sums stay far below 2^24 and are
	// represented exactly in float regardless of summation order.
	assert(sommaGPU == sommaCPU);

	free(a_h);
	free(b_h);
	free(c_h);
	free(u_h);
	CUDA_CHECK(cudaFree(a_d));
	CUDA_CHECK(cudaFree(b_d));
	CUDA_CHECK(cudaFree(c_d));
	CUDA_CHECK(cudaFree(u_d));
	return 0;
}

// Host
/*
 * Serial element-wise product of two vectors.
 *
 * a, b : input vectors, n elements each
 * c    : output vector, receives a[k] * b[k] at every position k in [0, n)
 * n    : number of elements to process; nothing is written when n <= 0
 */
void prodottoCPU(float *a, float *b, float *c, int n) {
	for (int pos = 0; pos < n; ++pos) {
		const float lhs = a[pos];
		const float rhs = b[pos];
		c[pos] = lhs * rhs;
	}
}

// Device
/*
 * Per-block partial dot product.
 *
 * Launch layout : 1-D grid of 1-D blocks; thread (blockIdx.x, threadIdx.x)
 *                 handles element blockIdx.x * blockDim.x + threadIdx.x.
 * Shared memory : blockDim.x * sizeof(float) bytes of dynamic shared memory
 *                 (third launch parameter).
 *
 * a, b : input vectors in device memory, n elements each
 * c    : not accessed by this kernel; kept so the signature is unchanged
 * u    : output, one float per block; u[blockIdx.x] receives the sum of
 *        a[i] * b[i] over the elements handled by that block
 * n    : number of valid elements in a and b
 */
__global__ void prodottoGPU(float* a, float * b, float* c, float* u, int n) {
	extern __shared__ float s[];
	int index = threadIdx.x + blockIdx.x*blockDim.x;

	// Every thread must write its slot: threads past the end of the data
	// contribute 0 so the reduction below never reads uninitialised shared
	// memory when n is not a multiple of blockDim.x.
	s[threadIdx.x] = (index < n) ? a[index]*b[index] : 0.0f;
	__syncthreads();

	// Tree reduction with a doubling stride: at each step the threads whose
	// index is a multiple of 2*k add in the partial sum located k slots away.
	for (unsigned int k = 1; k < blockDim.x; k *= 2) {
		// The second condition keeps the read inside the shared array when
		// blockDim.x is not a power of two.
		if (threadIdx.x % (2 * k) == 0 && threadIdx.x + k < blockDim.x) {
			s[threadIdx.x] = s[threadIdx.x] + s[threadIdx.x + k];
		}
		// Outside the branch so that every thread of the block reaches it.
		__syncthreads();
	}

	// Thread 0 publishes the block's total.
	if (threadIdx.x == 0) {
		u[blockIdx.x] = s[0];
	}
}


s_GPU = 398.00
s_CPU = 398.00
time_GPU = 0.282496
time_CPU = 2.520608

