In [50]:
code = """
#include <iostream>
#include <cuda.h>

// Element-wise vector addition: C[i] = A[i] + B[i] for every i in [0, size).
//
// Launch layout: 1D grid of 1D blocks. Each thread starts at its global index
// and advances by the total number of launched threads, so the result is
// complete for any grid/block size (including a grid smaller than `size`).
// No shared memory is used.
//
// Parameters:
//   A, B  - input vectors, read at indices [0, size)
//   C     - output vector, written at indices [0, size)
//   size  - number of elements to process; values <= 0 do nothing
__global__
void vectorAddition(const int* A, const int* B, int* C, int size) {
    // Widen to size_t before multiplying so the index arithmetic cannot wrap
    // in 32 bits for very large launches.
    const size_t stride = static_cast<size_t>(gridDim.x) * blockDim.x;
    const size_t first = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
    const size_t count = size > 0 ? static_cast<size_t>(size) : 0;

    for (size_t i = first; i < count; i += stride) {
        C[i] = A[i] + B[i];
    }
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
// Note: std::endl is used instead of backslash escapes because this source is
// embedded in a non-raw Python string.
static bool cudaOk(cudaError_t status, const char* what) {
    if (status == cudaSuccess) {
        return true;
    }
    std::cerr << "CUDA error during " << what << ": "
              << cudaGetErrorString(status) << std::endl;
    return false;
}

// Adds two vectors of ones and twos on the GPU and verifies that every
// element of the result equals 3.
// Returns 0 when all CUDA calls succeed and the result is correct, 1 otherwise.
int main() {
    const int size = 1000000;  // Number of elements in each vector
    const size_t bytes = static_cast<size_t>(size) * sizeof(int);

    // Host vectors
    int* A = new int[size];
    int* B = new int[size];
    int* C = new int[size];

    // Initialize input vectors
    for (int i = 0; i < size; ++i) {
        A[i] = 1;
        B[i] = 2;
    }

    // Device vectors; kept null until allocated so the cleanup below is
    // always safe (cudaFree on a null pointer performs no operation).
    int* d_A = nullptr;
    int* d_B = nullptr;
    int* d_C = nullptr;

    // Allocate device memory and upload the inputs. The && chain stops at the
    // first failing call, so later steps never run on a broken state.
    bool ok = cudaOk(cudaMalloc((void**)&d_A, bytes), "cudaMalloc(d_A)")
           && cudaOk(cudaMalloc((void**)&d_B, bytes), "cudaMalloc(d_B)")
           && cudaOk(cudaMalloc((void**)&d_C, bytes), "cudaMalloc(d_C)")
           && cudaOk(cudaMemcpy(d_A, A, bytes, cudaMemcpyHostToDevice), "copy of A to device")
           && cudaOk(cudaMemcpy(d_B, B, bytes, cudaMemcpyHostToDevice), "copy of B to device");

    if (ok) {
        // Define the number of threads per block and the number of blocks
        // (ceiling division so every element is covered).
        const int threadsPerBlock = 256;
        const int blocksPerGrid = (size + threadsPerBlock - 1) / threadsPerBlock;

        // Launch the vector addition kernel
        vectorAddition<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, size);

        // A launch does not return a status: query it explicitly. The blocking
        // copy that follows also reports any error raised while the kernel ran.
        ok = cudaOk(cudaGetLastError(), "vectorAddition launch")
          && cudaOk(cudaMemcpy(C, d_C, bytes, cudaMemcpyDeviceToHost), "copy of C to host");
    }

    // Verify the result and report a single verdict.
    int mismatches = 0;
    if (ok) {
        for (int i = 0; i < size; ++i) {
            if (C[i] != 3) {
                ++mismatches;
            }
        }
        if (mismatches == 0) {
            std::cout << "success " << std::endl;
        } else {
            std::cout << "error: " << mismatches << " of " << size
                      << " elements are wrong" << std::endl;
        }
    }

    // Free device memory
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);

    // Free host memory
    delete[] A;
    delete[] B;
    delete[] C;

    return (ok && mismatches == 0) ? 0 : 1;
}

"""


In [51]:
# Write the CUDA source held in `code` to assign1.cu.
# The context manager closes the file even if the write raises.
with open("assign1.cu", "w") as text_file:
    text_file.write(code)

In [52]:
!nvcc assign1.cu

In [53]:
!./a.out

success 


In [54]:
!nvprof ./a.out

==8431== NVPROF is profiling process 8431, command: ./a.out
success 
==8431== Profiling application: ./a.out
==8431== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   51.86%  1.6000ms         1  1.6000ms  1.6000ms  1.6000ms  [CUDA memcpy DtoH]
                   46.62%  1.4382ms         2  719.11us  688.98us  749.24us  [CUDA memcpy HtoD]
                    1.52%  47.039us         1  47.039us  47.039us  47.039us  vectorAddition(int const *, int const *, int*, int)
      API calls:   97.47%  212.07ms         3  70.690ms  69.182us  211.93ms  cudaMalloc
                    2.19%  4.7655ms         3  1.5885ms  932.41us  2.8717ms  cudaMemcpy
                    0.25%  554.24us         3  184.75us  136.71us  210.97us  cudaFree
                    0.06%  120.52us       101  1.1930us     127ns  51.553us  cuDeviceGetAttribute
                    0.01%  30.347us         1  30.347us  30.347us  30.347us  cudaLaunchKernel
       

In [55]:
code2 = """
#include <iostream>
#include <cuda.h>

const int N = 4;  // Dimension of the square N x N matrices used in main()

// Matrix product kernel: C = A * B for square n x n int matrices stored
// row-major in flat arrays.
//
// Launch layout: 2D grid of 2D blocks; the x dimension selects the column and
// the y dimension selects the row. Each in-range thread writes exactly one
// element of C. No shared memory is used.
__global__
void matrixMultiplication(const int* A, const int* B, int* C, int n) {
    const int r = blockIdx.y * blockDim.y + threadIdx.y;
    const int c = blockIdx.x * blockDim.x + threadIdx.x;

    // Threads that land outside the matrix have nothing to compute.
    if (r >= n || c >= n) {
        return;
    }

    // Dot product of row r of A with column c of B.
    const int* rowOfA = A + r * n;
    int acc = 0;
    for (int k = 0; k < n; ++k) {
        acc += rowOfA[k] * B[k * n + c];
    }

    C[r * n + c] = acc;
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
// Note: std::endl is used instead of backslash escapes because this source is
// embedded in a non-raw Python string.
static bool cudaOk(cudaError_t status, const char* what) {
    if (status == cudaSuccess) {
        return true;
    }
    std::cerr << "CUDA error during " << what << ": "
              << cudaGetErrorString(status) << std::endl;
    return false;
}

// Multiplies a fixed N x N matrix by the identity matrix on the GPU and
// prints the product, one row per line.
// Returns 0 when every CUDA call succeeds, 1 otherwise (nothing is printed to
// stdout in that case, so stale host data is never shown as a result).
int main() {
    int A[N][N] = {{1, 2, 3, 4},
                   {5, 6, 7, 8},
                   {9, 10, 11, 12},
                   {13, 14, 15, 16}};

    int B[N][N] = {{1, 0, 0, 0},
                   {0, 1, 0, 0},
                   {0, 0, 1, 0},
                   {0, 0, 0, 1}};

    int C[N][N] = {0};

    const size_t bytes = N * N * sizeof(int);

    // Device matrices; kept null until allocated so the cleanup below is
    // always safe (cudaFree on a null pointer performs no operation).
    int* d_A = nullptr;
    int* d_B = nullptr;
    int* d_C = nullptr;

    // Allocate device memory and upload the inputs. The && chain stops at the
    // first failing call.
    bool ok = cudaOk(cudaMalloc((void**)&d_A, bytes), "cudaMalloc(d_A)")
           && cudaOk(cudaMalloc((void**)&d_B, bytes), "cudaMalloc(d_B)")
           && cudaOk(cudaMalloc((void**)&d_C, bytes), "cudaMalloc(d_C)")
           && cudaOk(cudaMemcpy(d_A, A, bytes, cudaMemcpyHostToDevice), "copy of A to device")
           && cudaOk(cudaMemcpy(d_B, B, bytes, cudaMemcpyHostToDevice), "copy of B to device");

    if (ok) {
        // Define the number of threads per block and the number of blocks
        // (ceiling division in each dimension so the whole matrix is covered).
        const int threadsPerBlock = 2;
        const int blocksPerSide = (N + threadsPerBlock - 1) / threadsPerBlock;
        const dim3 block(threadsPerBlock, threadsPerBlock);
        const dim3 grid(blocksPerSide, blocksPerSide);

        // Launch the matrix multiplication kernel
        matrixMultiplication<<<grid, block>>>(d_A, d_B, d_C, N);

        // A launch does not return a status: query it explicitly. The blocking
        // copy that follows also reports any error raised while the kernel ran.
        ok = cudaOk(cudaGetLastError(), "matrixMultiplication launch")
          && cudaOk(cudaMemcpy(C, d_C, bytes, cudaMemcpyDeviceToHost), "copy of C to host");
    }

    // Print the result
    if (ok) {
        for (int i = 0; i < N; ++i) {
            for (int j = 0; j < N; ++j) {
                std::cout << C[i][j] << " ";
            }
            std::cout << std::endl;
        }
    }

    // Free device memory
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);

    return ok ? 0 : 1;
}


"""

In [56]:
# Write the CUDA source held in `code2` to assign2.cu.
# The context manager closes the file even if the write raises.
with open("assign2.cu", "w") as text_file:
    text_file.write(code2)

In [57]:
!nvcc assign2.cu -o b

In [58]:
!./b

1 2 3 4 
5 6 7 8 
9 10 11 12 
13 14 15 16 


In [59]:
!nvprof ./b

==8542== NVPROF is profiling process 8542, command: ./b
1 2 3 4 
5 6 7 8 
9 10 11 12 
13 14 15 16 
==8542== Profiling application: ./b
==8542== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   48.48%  5.1200us         1  5.1200us  5.1200us  5.1200us  matrixMultiplication(int const *, int const *, int*, int)
                   31.52%  3.3280us         2  1.6640us  1.4080us  1.9200us  [CUDA memcpy HtoD]
                   20.00%  2.1120us         1  2.1120us  2.1120us  2.1120us  [CUDA memcpy DtoH]
      API calls:   99.82%  215.21ms         3  71.738ms  4.9780us  215.20ms  cudaMalloc
                    0.07%  140.75us         3  46.916us  5.3360us  124.78us  cudaFree
                    0.06%  133.25us       101  1.3190us     136ns  52.252us  cuDeviceGetAttribute
                    0.03%  54.061us         3  18.020us  8.3050us  23.556us  cudaMemcpy
                    0.01%  26.139us         1  26.139us  26.139us  26