In [5]:
code = """
#include<iostream>
#include<cstdlib>
using namespace std;

/**
 * Element-wise vector addition kernel: result[i] = a[i] + b[i].
 *
 * Each thread handles at most one element, chosen by its global index
 * blockIdx.x*blockDim.x + threadIdx.x.
 *
 * a, b    input arrays, each holding at least n ints
 * result  output array with room for at least n ints
 * n       number of elements to add
 */
__global__ void vectorAdd(int *a, int *b, int *result, int n) {
    int tid = blockIdx.x*blockDim.x + threadIdx.x;
    // Valid indices are 0..n-1. The launch may supply more threads than
    // elements, so every thread with tid >= n must do nothing; comparing
    // with <= would let thread n touch one element past the end.
    if(tid < n) {
        result[tid] = a[tid] + b[tid];
    }
}

// Prints the first N values of a on one line of standard output, each
// value preceded by two spaces, then terminates the line and flushes.
void print_array(int *a, int N) {
    int idx = 0;
    while(idx < N) {
        cout << "  ";
        cout << a[idx];
        ++idx;
    }
    cout << endl;
}

// Fills the first N slots of a, in index order, with pseudo-random
// integers in the range 1..10 drawn from rand().
void init_array(int *a, int N) {
    for(int slot = 0; slot < N; ++slot) {
        const int draw = rand();
        a[slot] = 1 + draw % 10;
    }
}

// Reports a failed CUDA runtime call on stderr and terminates the program.
//   status  value returned by the CUDA runtime call
//   what    short label for the call, used in the error message
static void check_cuda(cudaError_t status, const char *what) {
    if(status != cudaSuccess) {
        cerr<<what<<" failed: "<<cudaGetErrorString(status)<<endl;
        exit(EXIT_FAILURE);
    }
}

// Adds two randomly initialised n-element integer vectors on the GPU and
// prints both inputs followed by their element-wise sum.
// Returns 0 on success; exits with EXIT_FAILURE when a host allocation or
// any CUDA call fails.
int main() {
    int *a, *b, *c;
    int *a_dev, *b_dev, *c_dev;
    int n = 8;           // elements per vector

    // One byte count shared by the host and device buffers, derived from
    // the element type rather than from the type of the counter n.
    size_t size = n * sizeof(int);

    a = (int*)malloc(size);
    b = (int*)malloc(size);
    c = (int*)malloc(size);
    if(a == NULL || b == NULL || c == NULL) {
        cerr<<"Host allocation failed"<<endl;
        return EXIT_FAILURE;
    }

    check_cuda(cudaMalloc(&a_dev, size), "cudaMalloc(a_dev)");
    check_cuda(cudaMalloc(&b_dev, size), "cudaMalloc(b_dev)");
    check_cuda(cudaMalloc(&c_dev, size), "cudaMalloc(c_dev)");

    init_array(a, n);
    init_array(b, n);

    print_array(a, n);
    print_array(b, n);

    check_cuda(cudaMemcpy(a_dev, a, size, cudaMemcpyHostToDevice), "cudaMemcpy(a -> a_dev)");
    check_cuda(cudaMemcpy(b_dev, b, size, cudaMemcpyHostToDevice), "cudaMemcpy(b -> b_dev)");

    // Ceiling division: enough blocks that every element gets a thread,
    // so the launch keeps covering the data if n grows past one block.
    int threads = 1024;
    int blocks = (n+threads-1)/threads;

    vectorAdd<<<blocks,threads>>>(a_dev, b_dev, c_dev, n);
    // A kernel launch returns no status itself; a bad launch configuration
    // is only visible through cudaGetLastError.
    check_cuda(cudaGetLastError(), "vectorAdd launch");

    // Kernel timing is not measured here; the notebook runs the binary
    // under nvprof instead.
    check_cuda(cudaMemcpy(c, c_dev, size, cudaMemcpyDeviceToHost), "cudaMemcpy(c_dev -> c)");

    cout<<"Results : "<<endl;
    print_array(c, n);

    check_cuda(cudaFree(a_dev), "cudaFree(a_dev)");
    check_cuda(cudaFree(b_dev), "cudaFree(b_dev)");
    check_cuda(cudaFree(c_dev), "cudaFree(c_dev)");

    // Release the host buffers as well.
    free(a);
    free(b);
    free(c);

    return 0;
}
"""

In [6]:

# Write the CUDA source held in `code` to assign2.cu for the compiler.
# The context manager closes the file even if the write raises.
with open("assign2.cu", "w") as text_file:
    text_file.write(code)

In [7]:

# Compile the generated CUDA source with nvcc; no output name is given, and
# the following cell runs the resulting ./a.out.
!nvcc assign2.cu


In [8]:
# Run the compiled binary under nvprof, which prints the program's own output
# followed by a timing summary of GPU activities and CUDA API calls.
!nvprof ./a.out

==225== NVPROF is profiling process 225, command: ./a.out
  4  7  8  6  4  6  7  3
  10  2  3  8  1  10  4  7
Results : 
  14  9  11  14  5  16  11  10
==225== Profiling application: ./a.out
==225== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   38.74%  3.1360us         1  3.1360us  3.1360us  3.1360us  vectorAdd(int*, int*, int*, int)
                   38.34%  3.1040us         2  1.5520us  1.3120us  1.7920us  [CUDA memcpy HtoD]
                   22.92%  1.8560us         1  1.8560us  1.8560us  1.8560us  [CUDA memcpy DtoH]
      API calls:   99.56%  176.64ms         3  58.879ms  7.3040us  176.62ms  cudaMalloc
                    0.21%  369.59us         1  369.59us  369.59us  369.59us  cuDeviceTotalMem
                    0.08%  138.55us        97  1.4280us     135ns  58.422us  cuDeviceGetAttribute
                    0.07%  119.90us         3  39.967us  7.2080us  98.240us  cudaFree
                    0.04%  65.564