Loading nvidia compilers

In [2]:
# Checking version of CUDA
!nvcc --version
!nvidia-smi


nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2025 NVIDIA Corporation
Built on Tue_May_27_02:21:03_PDT_2025
Cuda compilation tools, release 12.9, V12.9.86
Build cuda_12.9.r12.9/compiler.36037853_0
Mon Nov 10 16:41:29 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 555.42.06              Driver Version: 555.42.06      CUDA Version: 12.5     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 3050 ...    Off |   00000000:01:00.0 Off |                  N/A |
| N/A   45C    P0              7W /   60W |      24MiB /   4096MiB |      4%      Default |
|                       

In [3]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Jun__6_02:18:23_PDT_2024
Cuda compilation tools, release 12.5, V12.5.82
Build cuda_12.5.r12.5/compiler.34385749_0


In [4]:
%%writefile VecAdd.cu
#include<stdio.h>

__global__ void add_vectors(float *ad, float *bd, int N)
{
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    if (index < N)
        ad[index] += bd[index];                             //adding values in GPU memory
}
int main()
{
	float *a,*b;
	float *ad,*bd;
    int N = 1024;                                  //size of vector to be added
    unsigned int i, No_of_blocks, No_of_threads;
	size_t size = sizeof(float) * N;

    //allocate memory on host
	a=(float*)malloc(size);
	b=(float*)malloc(size);

    //allocate memory on device
	cudaMalloc(&ad,size);
	//printf("\nAfter cudaMalloc for ad\n%s\n",cudaGetErrorString(cudaGetLastError()));
	cudaMalloc(&bd,size);
	//printf("\nAfter cudaMalloc for bd\n%s\n",cudaGetErrorString(cudaGetLastError()));

    //initialize host memory with its own indices
    for(i=0; i<N; i++)
    {
        a[i]=(float)i;
        b[i]= -(float)i;
    }

	//copy data from host memory to device memory
	cudaMemcpy(ad,a,size,cudaMemcpyHostToDevice);
    //printf("\nAfter HostToDevice Memcpy for ad\n%s\n",cudaGetErrorString(cudaGetLastError()));
	cudaMemcpy(bd,b,size,cudaMemcpyHostToDevice);
    //printf("\nAfter HostToDevice Memcpy for bd\n%s\n",cudaGetErrorString(cudaGetLastError()));

    //calculate execution configuration
    if (N > 512)
    {
        No_of_threads = 512;
        No_of_blocks = (N / 512) + (((N % 512) == 0) ? 0 : 1);

    }
    else
    {
        No_of_threads = N;
        No_of_blocks = 1;
    }
    dim3 block (No_of_threads, 1, 1);
    dim3 grid (No_of_blocks, 1, 1);

    //GPU timer code
    float time;
    cudaEvent_t start,stop;
	cudaEventCreate(&start);
    cudaEventCreate(&stop);
	cudaEventRecord(start,0);

    //launch kernel with only one thread
    add_vectors<<< grid, block >>>(ad, bd, N);

    cudaEventRecord(stop,0);
	cudaEventSynchronize(stop);
	cudaEventElapsedTime(&time,start,stop);			//time taken in kernel call calculated
	cudaEventDestroy(start);
	cudaEventDestroy(stop);

    //copy back the results
	cudaMemcpy(a,ad,size,cudaMemcpyDeviceToHost);
	//printf("\nAfter DeviceToHost Memcpy for a\n%s\n",cudaGetErrorString(cudaGetLastError()));

	//print the results
	printf("\nAddition of above two VECTORS on GPU evaluates to = \n");
    for (i = 0; i < N; i++)
        printf("%f\n", a[i]);                       //if correctly evaluated, all values will be 0
    printf("\n\nTime taken is %f (ms)\n",time);

    //deallocate host and device memories
    cudaFree(ad); cudaFree(bd);
	free(a);free(b);


    return 1;
}

Overwriting VecAdd.cu


In [5]:
!ls

 Additionoftwomatrices.cu  'Inroduction to  GPGPU and CUDA.ppt'
 Additionoftwonumbers.cu   'Instructions to Run CUDA on Google Colab.docx'
 Additionoftwovectors.cu    VecAdd.cu
'CUDA Programming.ppt'	    VecAdd.ipynb


In [6]:
!nvcc VecAdd.cu

In [7]:
!./a.out


Addition of above two VECTORS on GPU evaluates to = 
0.000000
1.000000
2.000000
3.000000
4.000000
5.000000
6.000000
7.000000
8.000000
9.000000
10.000000
11.000000
12.000000
13.000000
14.000000
15.000000
16.000000
17.000000
18.000000
19.000000
20.000000
21.000000
22.000000
23.000000
24.000000
25.000000
26.000000
27.000000
28.000000
29.000000
30.000000
31.000000
32.000000
33.000000
34.000000
35.000000
36.000000
37.000000
38.000000
39.000000
40.000000
41.000000
42.000000
43.000000
44.000000
45.000000
46.000000
47.000000
48.000000
49.000000
50.000000
51.000000
52.000000
53.000000
54.000000
55.000000
56.000000
57.000000
58.000000
59.000000
60.000000
61.000000
62.000000
63.000000
64.000000
65.000000
66.000000
67.000000
68.000000
69.000000
70.000000
71.000000
72.000000
73.000000
74.000000
75.000000
76.000000
77.000000
78.000000
79.000000
80.000000
81.000000
82.000000
83.000000
84.000000
85.000000
86.000000
87.000000
88.000000
89.000000
90.000000
91.000000
92.000000
93.000000
94.000000
95.000

In [None]:
!nvcc -arch=sm_86 -gencode=arch=compute_86,code=sm_86 VecAdd.cu -o VecAdd

In [10]:
!./VecAdd


Addition of above two VECTORS on GPU evaluates to = 
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0