In [0]:
!pwd

/content


In [0]:
! nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2019 NVIDIA Corporation
Built on Sun_Jul_28_19:07:16_PDT_2019
Cuda compilation tools, release 10.1, V10.1.243


In [0]:
! pip install git+git://github.com/andreinechaev/nvcc4jupyter.git

Collecting git+git://github.com/andreinechaev/nvcc4jupyter.git
  Cloning git://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-e64f17yg
  Running command git clone -q git://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-e64f17yg
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25l[?25hdone
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-cp36-none-any.whl size=4307 sha256=66cb8ab43ca65bd82d023524dccd0b3da4b87703f938fd9b699e70ba55ff7e40
  Stored in directory: /tmp/pip-ephem-wheel-cache-jbk8u275/wheels/10/c2/05/ca241da37bff77d60d31a9174f988109c61ba989e4d4650516
Successfully built NVCCPlugin
Installing collected packages: NVCCPlugin
Successfully installed NVCCPlugin-0.0.2


In [0]:
%load_ext nvcc_plugin

created output directory at /content/src
Out bin /content/result.out


In [0]:
%%cuda --name hello.cu
#include "stdio.h"
/*
 * add - device kernel that stores the sum of two integers.
 *
 * a, b : integer operands, passed by value to the kernel.
 * c    : pointer the kernel writes a + b through; it must be writable
 *        from device code.
 *
 * The kernel does not use any thread or block index, so every launched
 * thread writes the same value to the same location.
 */
__global__ void add(int a,int b, int *c)
{
    const int sum = a + b;
    c[0] = sum;
}
/*
 * Host entry point: adds 3 and 4 on the GPU and prints the result.
 *
 * Steps: allocate one int on the device, launch `add` with a single
 * thread, copy the result back to the host, print it, release the device
 * allocation.
 *
 * Every CUDA runtime call is checked. On failure a diagnostic is written
 * to stderr, the device allocation (if any) is released, and the process
 * returns 1 instead of printing an uninitialised result.
 *
 * Returns 0 on success, 1 on any CUDA error.
 */
int main()
{
    const int a = 3;
    const int b = 4;
    int c = 0;
    int *dev_c = NULL;
    cudaError_t err;

    err = cudaMalloc((void**)&dev_c, sizeof(int));
    if (err != cudaSuccess)
    {
        fprintf(stderr, "cudaMalloc failed: %s\n", cudaGetErrorString(err));
        return 1;
    }

    add<<<1,1>>>(a, b, dev_c);

    /* A kernel launch returns nothing; launch failures (bad configuration,
       no usable device, ...) are reported through cudaGetLastError(). */
    err = cudaGetLastError();
    if (err != cudaSuccess)
    {
        fprintf(stderr, "add kernel launch failed: %s\n", cudaGetErrorString(err));
        cudaFree(dev_c);
        return 1;
    }

    /* cudaMemcpy blocks until the kernel has finished, so errors raised
       while the kernel was executing are reported here as well. */
    err = cudaMemcpy(&c, dev_c, sizeof(int), cudaMemcpyDeviceToHost);
    if (err != cudaSuccess)
    {
        fprintf(stderr, "cudaMemcpy (device to host) failed: %s\n", cudaGetErrorString(err));
        cudaFree(dev_c);
        return 1;
    }

    printf("%d + %d = %d",a,b,c);

    err = cudaFree(dev_c);
    if (err != cudaSuccess)
    {
        fprintf(stderr, "cudaFree failed: %s\n", cudaGetErrorString(err));
        return 1;
    }
    return 0;
}

UsageError: Cell magic `%%cuda` not found.


In [0]:
!nvcc /content/src/hello.cu -o /content/src/hello

In [0]:
!/content/src/hello

3 + 4 = 7

In [0]:
%%cuda --name VectorAdd.cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <cuda_runtime.h>
/*
 * vectorAdd - element-wise sum of two float arrays: C[k] = A[k] + B[k].
 *
 * A, B : input arrays, read only.
 * C    : output array.
 * n    : number of elements to process.
 *
 * Each thread derives one flat index from its block and thread x
 * coordinates and handles at most one element. Threads whose index is n
 * or greater do nothing, so the launch may contain more threads than
 * there are elements.
 */
__global__ void vectorAdd(const float *A,const float *B, float *C, int n)
{
	const int idx = blockIdx.x * blockDim.x + threadIdx.x;

	// Surplus threads in the last block have no element to work on.
	if(idx >= n)
	{
		return;
	}

	C[idx] = A[idx] + B[idx];
}

/*
 * Exit with a diagnostic if a CUDA runtime call did not succeed.
 *
 * status : value returned by the CUDA runtime call.
 * what   : short description of the step, used in the error message.
 *
 * Terminates the process on failure; allocations still live at that point
 * are left for the operating system / driver to reclaim at process exit.
 */
static void checkCuda(cudaError_t status, const char *what)
{
	if(status != cudaSuccess)
	{
		fprintf(stderr,"CUDA error during %s: %s\n",what,cudaGetErrorString(status));
		exit(EXIT_FAILURE);
	}
}

/*
 * Host driver for vectorAdd.
 *
 * Reads from stdin: an element count n, then n floats for vector A, then
 * n floats for vector B. Computes C = A + B on the GPU, prints every
 * element of C, and verifies the device result against a host-side sum
 * with an absolute tolerance of 1e-5.
 *
 * Returns EXIT_SUCCESS when verification passes. Returns EXIT_FAILURE on
 * malformed input, a non-positive n, host allocation failure, or a
 * verification mismatch. Any CUDA error terminates the process with
 * EXIT_FAILURE through checkCuda().
 */
int main(void)
{
	int n=0;
	if(scanf("%d",&n)!=1 || n<=0)
	{
		// A non-positive count would produce a zero-block launch or a
		// huge size_t after conversion, so it is rejected up front.
		fprintf(stderr,"Expected a positive element count on stdin\n");
		return EXIT_FAILURE;
	}

	size_t size= (size_t)n * sizeof(float);
	printf("Vector addition of %d elements:\n",n);

	//host side allocation of memory
	float *h_A=(float *)malloc(size);
	float *h_B=(float *)malloc(size);
	float *h_C=(float *)malloc(size);
	if(h_A==NULL || h_B==NULL || h_C==NULL)
	{
		fprintf(stderr,"Failed to allocate host vectors\n");
		free(h_A);
		free(h_B);
		free(h_C);
		return EXIT_FAILURE;
	}

	//read A first, then B; stop on the first value that does not parse
	int inputOk=1;
	for(int i=0;inputOk && i<n;++i)
	{
		inputOk = (scanf("%f",&h_A[i])==1);
	}
	for(int i=0;inputOk && i<n;++i)
	{
		inputOk = (scanf("%f",&h_B[i])==1);
	}
	if(!inputOk)
	{
		fprintf(stderr,"Failed to read %d values for each input vector\n",n);
		free(h_A);
		free(h_B);
		free(h_C);
		return EXIT_FAILURE;
	}

	float *d_A=NULL,*d_B=NULL,*d_C=NULL;

	//device side allocation of memory
	checkCuda(cudaMalloc((void **)&d_A,size),"cudaMalloc of d_A");
	checkCuda(cudaMalloc((void **)&d_B,size),"cudaMalloc of d_B");
	checkCuda(cudaMalloc((void **)&d_C,size),"cudaMalloc of d_C");

	//Copying the host values to device
	checkCuda(cudaMemcpy(d_A,h_A,size,cudaMemcpyHostToDevice),"copy of A to device");
	checkCuda(cudaMemcpy(d_B,h_B,size,cudaMemcpyHostToDevice),"copy of B to device");

	int threadsperblock=256;
	//ceiling division so the tail elements get a block too
	int blocksPerGrid= (n+threadsperblock-1)/threadsperblock;

	printf("Cuda kernel launch with %d blocks with %d threads per block\n", blocksPerGrid,threadsperblock);

	//vector addition is executed in device side
	vectorAdd<<<blocksPerGrid,threadsperblock>>>(d_A,d_B,d_C,n);

	//launch-configuration errors are only visible through cudaGetLastError();
	//errors raised while the kernel runs surface at the next synchronizing call
	checkCuda(cudaGetLastError(),"vectorAdd launch");
	checkCuda(cudaDeviceSynchronize(),"vectorAdd execution");

	//copying back the values of vector C from device to host
	checkCuda(cudaMemcpy(h_C,d_C,size,cudaMemcpyDeviceToHost),"copy of C to host");

	printf("Vector C values are: \n");
	for(int i=0;i<n;++i)
	{
		printf("%f\n",h_C[i]);
	}

	//Free the device side variables
	checkCuda(cudaFree(d_A),"cudaFree of d_A");
	checkCuda(cudaFree(d_B),"cudaFree of d_B");
	checkCuda(cudaFree(d_C),"cudaFree of d_C");

	int status=EXIT_SUCCESS;
	for(int i=0;i<n;++i)
	{
		if(fabs(h_A[i]+h_B[i]-h_C[i]) > 1e-5)
		{
			//If more than 1e-5 difference is there between A+B and value in C
			fprintf(stderr,"Result verification falied at element %d\n",i);
			status=EXIT_FAILURE;
			break;
		}
	}

	if(status==EXIT_SUCCESS)
	{
		printf("Test PASSED\n");
	}

	//release the host buffers on both the pass and the fail path
	free(h_A);
	free(h_B);
	free(h_C);
	return status;
}

In [0]:
!nvcc /content/src/VectorAdd.cu -o /content/src/Vectoradd




In [0]:
!/content/src/Vectoradd

5
Vector addition of 5 elements:
1
2
3
4
5
1
1
1
1
1
Cuda kernel launch with 1 blocks with 256 threads per block
Vector C values are: 
2.000000
3.000000
4.000000
5.000000
6.000000
Test PASSED
