In [0]:
! pwd

/content


In [0]:
! nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2019 NVIDIA Corporation
Built on Sun_Jul_28_19:07:16_PDT_2019
Cuda compilation tools, release 10.1, V10.1.243


In [0]:
! pip install git+git://github.com/andreinechaev/nvcc4jupyter.git

Collecting git+git://github.com/andreinechaev/nvcc4jupyter.git
  Cloning git://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-jhh5jvny
  Running command git clone -q git://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-jhh5jvny
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25l[?25hdone
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-cp36-none-any.whl size=4307 sha256=4dbb2a6fe81bca37b353f3167ba8a3d8cf903fee91628cdc1f4c4c53fe1839b6
  Stored in directory: /tmp/pip-ephem-wheel-cache-x2zb7mus/wheels/10/c2/05/ca241da37bff77d60d31a9174f988109c61ba989e4d4650516
Successfully built NVCCPlugin
Installing collected packages: NVCCPlugin
Successfully installed NVCCPlugin-0.0.2


In [0]:
%load_ext nvcc_plugin

The nvcc_plugin extension is already loaded. To reload it, use:
  %reload_ext nvcc_plugin


In [0]:
###########################################################################################################

**1) One block with the number of threads equal to the number of pixels, e.g. gridDim=(1,1,1), blockDim=(m,n,1)**

In [0]:
%%cuda --name Matrixadd1.cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <cuda_runtime.h>

/*
 * Element-wise addition of two row-major n x m float matrices: C = A + B.
 *
 * Launch layout: the x dimension of the launch indexes columns (0..m-1) and
 * the y dimension indexes rows (0..n-1), so a single block of (m,n,1) threads
 * covers the whole matrix. Threads that fall outside the matrix do nothing.
 *
 * A, B : device pointers to the n*m input elements (read only).
 * C    : device pointer to the n*m output elements.
 * n    : number of rows.
 * m    : number of columns.
 */
__global__ void matrixAdd(const float *A,const float *B, float *C, int n, int m)
{
	// x must be the column index: the block is launched as (m,n,1), so x spans
	// the m columns and y spans the n rows. Using x as the row index skipped
	// elements whenever n != m.
	int col = blockDim.x * blockIdx.x + threadIdx.x;
	int row = blockDim.y * blockIdx.y + threadIdx.y;
	if(row < n && col < m)
	{
		int ind = (m * row) + col;
		C[ind] = A[ind] + B[ind];
	}
}

/* Print a diagnostic and terminate if a CUDA runtime call did not succeed. */
static void checkCuda(cudaError_t status,const char *what)
{
	if(status != cudaSuccess)
	{
		fprintf(stderr,"%s failed: %s\n",what,cudaGetErrorString(status));
		exit(EXIT_FAILURE);
	}
}

/* Read `count` floats from stdin into dst; terminate on malformed or missing input. */
static void readMatrix(float *dst,size_t count,const char *name)
{
	for(size_t k=0;k<count;++k)
	{
		if(scanf("%f",&dst[k]) != 1)
		{
			fprintf(stderr,"Could not read element %lu of matrix %s\n",(unsigned long)k,name);
			exit(EXIT_FAILURE);
		}
	}
}

/* Read a positive integer dimension from stdin; terminate if it is missing or not positive. */
static int readDimension(const char *prompt)
{
	int value = 0;
	printf("%s",prompt);
	if(scanf("%d",&value) != 1 || value <= 0)
	{
		fprintf(stderr,"Matrix dimensions must be positive integers\n");
		exit(EXIT_FAILURE);
	}
	return value;
}

/*
 * Reads two n x m matrices from stdin, adds them on the GPU using one block of
 * (m,n,1) threads, prints the sum and verifies it against a host computation.
 * Returns 0 on success; exits with EXIT_FAILURE on bad input, on any CUDA
 * error, or when verification fails.
 */
int main(void)
{
	int n = readDimension("Enter number of rows for both matrices:");
	int m = readDimension("Enter number of columns for both matrices:");

	// The kernel operates on float, so host buffers hold float as well;
	// copying int bit patterns into float device buffers gave wrong sums.
	const size_t count = (size_t)n * (size_t)m;
	const size_t size = count * sizeof(float);
	printf("Matrix addition of %d x %d elements.\n",n,m);

	// Host side allocation (heap, not stack: the size comes from user input)
	float *h_A = (float *)malloc(size);
	float *h_B = (float *)malloc(size);
	float *h_C = (float *)malloc(size);
	if(h_A == NULL || h_B == NULL || h_C == NULL)
	{
		fprintf(stderr,"Host memory allocation failed\n");
		return EXIT_FAILURE;
	}

	printf("Enter values of matrix A:\n");
	readMatrix(h_A,count,"A");
	printf("Enter values of matrix B:\n");
	readMatrix(h_B,count,"B");

	float *d_A = NULL,*d_B = NULL,*d_C = NULL;

	// Device side allocation of memory
	checkCuda(cudaMalloc((void **)&d_A,size),"cudaMalloc(d_A)");
	checkCuda(cudaMalloc((void **)&d_B,size),"cudaMalloc(d_B)");
	checkCuda(cudaMalloc((void **)&d_C,size),"cudaMalloc(d_C)");

	// Copying the host values to device
	checkCuda(cudaMemcpy(d_A,h_A,size,cudaMemcpyHostToDevice),"cudaMemcpy(A, host to device)");
	checkCuda(cudaMemcpy(d_B,h_B,size,cudaMemcpyHostToDevice),"cudaMemcpy(B, host to device)");

	// 1) One block with one thread per element: gridDim=(1,1,1), blockDim=(m,n,1).
	// A single block cannot hold an arbitrary number of threads; if m*n is too
	// large the launch is rejected and reported by the check below.
	dim3 grid(1,1,1);
	dim3 block(m,n,1);

	// Matrix addition is executed on the device
	matrixAdd<<<grid,block>>>(d_A,d_B,d_C,n,m);
	checkCuda(cudaGetLastError(),"matrixAdd launch");

	// Copying the result back from device to host (blocks until the kernel is done)
	checkCuda(cudaMemcpy(h_C,d_C,size,cudaMemcpyDeviceToHost),"cudaMemcpy(C, device to host)");

	printf("Matrix C values are: \n");
	for(int i=0;i<n;++i)
	{
		for(int j=0;j<m;++j)
		{
			printf("%g ",h_C[(size_t)i*m+j]);
		}
		printf("\n");
	}

	// Free the device side variables
	checkCuda(cudaFree(d_A),"cudaFree(d_A)");
	checkCuda(cudaFree(d_B),"cudaFree(d_B)");
	checkCuda(cudaFree(d_C),"cudaFree(d_C)");

	// Verify every element against the sum computed on the host
	int status = 0;
	for(int i=0;i<n && status==0;++i)
	{
		for(int j=0;j<m;j++)
		{
			const size_t k = (size_t)i*m+j;
			if(fabs(h_A[k]+h_B[k]-h_C[k]) > 1e-5)
			{
				// More than 1e-5 difference between A+B and the value in C
				fprintf(stderr,"Result verification failed at element %d,%d\n",i,j);
				status = EXIT_FAILURE;
				break;
			}
		}
	}

	free(h_A);
	free(h_B);
	free(h_C);

	if(status != 0)
		return status;

	printf("Test PASSED\n");
	return 0;
}

In [0]:
!nvcc /content/src/Matrixadd1.cu -o /content/src/Matrixadd1




In [0]:
!/content/src/Matrixadd1

Enter number of rows for both matrices:3
Enter number of columns for both matrices:3
Matrix addition of 3 x 3 elements.
Enter values of matrix A:
1
1
1
2
2
2
3
3
3
Enter values of matrix B:
1
1
1
1
1
1
1
1
1
Matrix C values are: 
2 2 2 
3 3 3 
4 4 4 
Test PASSED


In [0]:
###########################################################################################################

**2) gridDim adjusted according to the values of m and n: gridDim=(ceil(n/4), ceil(m/4), 1), blockDim=(4,4,1)**

In [0]:
%%cuda --name Matrixadd2.cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <cuda_runtime.h>

/*
 * Adds two row-major n x m float matrices element by element and stores the
 * sum in C. The x dimension of the launch selects the row and the y dimension
 * selects the column; threads whose (row, column) lies outside the matrix
 * return without touching memory.
 */
__global__ void matrixAdd(float *A,float *B, float *C, int n, int m)
{
	const int row = blockDim.x * blockIdx.x + threadIdx.x;
	const int col = blockDim.y * blockIdx.y + threadIdx.y;

	// Guard clause: surplus threads in partially filled blocks have no element.
	if(row >= n || col >= m)
		return;

	const int offset = (m * row) + col;
	C[offset] = A[offset] + B[offset];
}

/* Print a diagnostic and terminate if a CUDA runtime call did not succeed. */
static void checkCuda(cudaError_t status,const char *what)
{
	if(status != cudaSuccess)
	{
		fprintf(stderr,"%s failed: %s\n",what,cudaGetErrorString(status));
		exit(EXIT_FAILURE);
	}
}

/* Read `count` floats from stdin into dst; terminate on malformed or missing input. */
static void readMatrix(float *dst,size_t count,const char *name)
{
	for(size_t k=0;k<count;++k)
	{
		if(scanf("%f",&dst[k]) != 1)
		{
			fprintf(stderr,"Could not read element %lu of matrix %s\n",(unsigned long)k,name);
			exit(EXIT_FAILURE);
		}
	}
}

/*
 * Reads two 4 x 4 matrices from stdin, adds them on the GPU using 4 x 4 thread
 * blocks on a grid sized to cover the matrix, prints the sum and verifies it
 * against a host computation. Returns 0 on success; exits with EXIT_FAILURE on
 * bad input, on any CUDA error, or when verification fails.
 */
int main(void)
{
	const int n=4,m=4;
	const int tile=4;	// threads per block along each dimension

	// The kernel operates on float, so host buffers hold float as well;
	// copying int bit patterns into float device buffers gave wrong sums.
	const size_t count = (size_t)n * (size_t)m;
	const size_t size = count * sizeof(float);
	printf("Matrix addition of %d x %d elements.\n",n,m);

	// Host side storage
	float h_A[n][m];
	float h_B[n][m];
	float h_C[n][m];

	printf("Enter values of matrix A:\n");
	readMatrix(&h_A[0][0],count,"A");
	printf("Enter values of matrix B:\n");
	readMatrix(&h_B[0][0],count,"B");

	float *d_A = NULL,*d_B = NULL,*d_C = NULL;

	// Device side allocation of memory
	checkCuda(cudaMalloc((void **)&d_A,size),"cudaMalloc(d_A)");
	checkCuda(cudaMalloc((void **)&d_B,size),"cudaMalloc(d_B)");
	checkCuda(cudaMalloc((void **)&d_C,size),"cudaMalloc(d_C)");

	// Copying the host values to device
	checkCuda(cudaMemcpy(d_A,h_A,size,cudaMemcpyHostToDevice),"cudaMemcpy(A, host to device)");
	checkCuda(cudaMemcpy(d_B,h_B,size,cudaMemcpyHostToDevice),"cudaMemcpy(B, host to device)");

	// 2) gridDim=(ceil(n/4),ceil(m/4),1), blockDim=(4,4,1).
	// The ceiling is taken with integer arithmetic: ceil(n/4) on ints truncates
	// first and would leave the last partial tile uncovered.
	dim3 grid((n+tile-1)/tile,(m+tile-1)/tile,1);
	dim3 block(tile,tile,1);

	// Matrix addition is executed on the device
	matrixAdd<<<grid,block>>>(d_A,d_B,d_C,n,m);
	checkCuda(cudaGetLastError(),"matrixAdd launch");

	// Copying the result back from device to host (blocks until the kernel is done)
	checkCuda(cudaMemcpy(h_C,d_C,size,cudaMemcpyDeviceToHost),"cudaMemcpy(C, device to host)");

	printf("Matrix C values are: \n");
	for(int i=0;i<n;++i)
	{
		for(int j=0;j<m;++j)
		{
			printf("%g ",h_C[i][j]);
		}
		printf("\n");
	}

	// Free the device side variables
	checkCuda(cudaFree(d_A),"cudaFree(d_A)");
	checkCuda(cudaFree(d_B),"cudaFree(d_B)");
	checkCuda(cudaFree(d_C),"cudaFree(d_C)");

	// Verify every element against the sum computed on the host
	for(int i=0;i<n;++i)
	{
		for(int j=0;j<m;j++)
		{
			if(fabs(h_A[i][j]+h_B[i][j]-h_C[i][j]) > 1e-5)
			{
				// More than 1e-5 difference between A+B and the value in C
				fprintf(stderr,"Result verification failed at element %d,%d\n",i,j);
				exit(EXIT_FAILURE);
			}
		}
	}

	printf("Test PASSED\n");
	return 0;
}

In [0]:
!nvcc /content/src/Matrixadd2.cu -o /content/src/Matrixadd2




In [0]:
!/content/src/Matrixadd2

Matrix addition of 4 x 4 elements.
Matrix A:
2 9 10 4 
6 6 5 3 
9 9 2 6 
4 8 1 2 
Matrix B:
1 2 7 8 
1 3 7 10 
2 6 3 7 
9 9 4 4 
Matrix C values are: 
3 11 17 12 
7 9 12 13 
11 15 5 13 
13 17 5 6 
Test PASSED
