<a href="https://colab.research.google.com/github/Hikachhu/ProgGPU/blob/main/Copie_de_CorrigeTD1ProgGPU.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Exécutez cette cellule pour installer la bonne version de CUDA

In [None]:
# Replace whatever CUDA / NVIDIA packages are present with CUDA 9.2.
# Step 1: purge the installed cuda / nvidia packages, then purge every package
# whose dpkg entry matches "cuda-", and drop dependencies that are no longer needed.
!apt-get --purge remove cuda nvidia* libnvidia-*
!dpkg -l | grep cuda- | awk '{print $2}' | xargs -n1 dpkg --purge
!apt-get remove cuda-*
!apt autoremove
!apt-get update
# Step 2: download the CUDA 9.2 local repository installer and save it as a .deb.
!wget https://developer.nvidia.com/compute/cuda/9.2/Prod/local_installers/cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64 -O cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64.deb
# Step 3: register the local repository and its signing key, refresh the
# package index, then install the cuda-9.2 package from it.
!dpkg -i cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64.deb
!apt-key add /var/cuda-repo-9-2-local/7fa2af80.pub
!apt-get update
!apt-get install cuda-9.2

In [None]:
# Print the version of the nvcc compiler driver found on the PATH.
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2018 NVIDIA Corporation
Built on Wed_Apr_11_23:16:29_CDT_2018
Cuda compilation tools, release 9.2, V9.2.88


In [None]:
# Install the nvcc4jupyter plugin straight from its GitHub repository.
# The clone goes over HTTPS: GitHub no longer serves the unauthenticated
# git:// protocol, so a git+git:// URL fails to clone.
# NOTE(review): this installs the repository's current HEAD; confirm it still
# provides the `nvcc_plugin` extension that this notebook loads afterwards.
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git

Collecting git+git://github.com/andreinechaev/nvcc4jupyter.git
  Cloning git://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-ntu5kdt6
  Running command git clone -q git://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-ntu5kdt6
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25l[?25hdone
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-py3-none-any.whl size=4306 sha256=23e203e6d5ff68ab91d44b9626d0d47f6923453859404aeb6606070a0e1776da
  Stored in directory: /tmp/pip-ephem-wheel-cache-_qfylp_w/wheels/c5/2b/c0/87008e795a14bbcdfc7c846a00d06981916331eb980b6c8bdf
Successfully built NVCCPlugin
Installing collected packages: NVCCPlugin
Successfully installed NVCCPlugin-0.0.2


In [None]:
# Load the nvcc_plugin IPython extension (presumably what provides the %%cu
# cell magic used by the CUDA cells of this notebook -- confirm).
%load_ext nvcc_plugin

created output directory at /content/src
Out bin /content/result.out


# Exercise 3

In [None]:
%%cu
#include <iostream>
// Device kernel: every launched thread prints one greeting line containing the
// x index of its block and its own x index inside that block.
__global__ void kernel(void)
{
    const unsigned int blockId = blockIdx.x;
    const unsigned int threadId = threadIdx.x;
    printf("Hello world from block %d, thread %d\n", blockId, threadId);
}

// Host entry point: launches `kernel` on 10 blocks of 10 threads and waits for
// the device to finish before exiting.
// Returns 0 on success, 1 when the launch or the execution reported a CUDA error.
int main(void){
    kernel<<<10,10>>>();

    // A kernel launch returns no status; an invalid launch is only visible
    // through cudaGetLastError().
    cudaError_t status = cudaGetLastError();
    if (status == cudaSuccess) {
        // Errors raised while the kernel runs are reported by the next
        // synchronizing call.
        status = cudaDeviceSynchronize();
    }
    if (status != cudaSuccess) {
        std::cerr << "CUDA error: " << cudaGetErrorString(status) << std::endl;
        return 1;
    }
    return 0;
}

Hello world from block 8, thread 0
Hello world from block 8, thread 1
Hello world from block 8, thread 2
Hello world from block 8, thread 3
Hello world from block 8, thread 4
Hello world from block 8, thread 5
Hello world from block 8, thread 6
Hello world from block 8, thread 7
Hello world from block 8, thread 8
Hello world from block 8, thread 9
Hello world from block 6, thread 0
Hello world from block 6, thread 1
Hello world from block 6, thread 2
Hello world from block 6, thread 3
Hello world from block 6, thread 4
Hello world from block 6, thread 5
Hello world from block 6, thread 6
Hello world from block 6, thread 7
Hello world from block 6, thread 8
Hello world from block 6, thread 9
Hello world from block 9, thread 0
Hello world from block 9, thread 1
Hello world from block 9, thread 2
Hello world from block 9, thread 3
Hello world from block 9, thread 4
Hello world from block 9, thread 5
Hello world from block 9, thread 6
Hello world from block 9, thread 7
Hello world from blo

# Exercise 4

In [None]:
%%cu
#include <chrono>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <vector>

#define N 512
#define Niter 1000

// CPU reference implementation: writes a[i] + b[i] into c[i] for every index
// i in [0, N*N). All three pointers must reference at least N*N ints.
void addCPU(int *a, int *b, int *c) {
    const int total = N*N;
    int idx = 0;
    while (idx < total) {
        c[idx] = a[idx] + b[idx];
        ++idx;
    }
}

// Device kernel: element-wise integer addition, c[i] = a[i] + b[i] for i in [0, N*N).
// Each thread starts at its global 1D index and then advances by the total
// number of launched threads (blockDim.x * gridDim.x) until it passes N*N,
// so the whole range is covered whatever the 1D launch configuration is.
__global__ void addKernel1D (int *a, int *b, int *c) {
    const unsigned int step = blockDim.x * gridDim.x;
    for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < N*N; idx += step) {
        c[idx] = a[idx] + b[idx];
    }
}

// Stops the program with a diagnostic on stderr when a CUDA runtime call failed.
static void checkCuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        std::fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        std::exit(EXIT_FAILURE);
    }
}

// Benchmarks the element-wise addition of two N*N int arrays on the CPU
// (addCPU) and on the GPU (addKernel1D), each averaged over Niter runs, then
// compares the GPU result against the CPU result over all N*N elements.
// Returns 0 when both results match, 1 when they differ; any CUDA error
// terminates the program with a failure status.
int main(void) {
    const size_t count = (size_t)N * N;
    const size_t bytes = count * sizeof(int);

    // Host buffers are heap-allocated: four N*N int arrays (4 MiB for N = 512)
    // as automatic variables could exhaust a limited stack.
    std::vector<int> A(count), B(count), C(count), D(count);
    int *d_A = NULL, *d_B = NULL, *d_C = NULL;

    cudaEvent_t start, stop;
    checkCuda(cudaEventCreate(&start), "cudaEventCreate(start)");
    checkCuda(cudaEventCreate(&stop), "cudaEventCreate(stop)");
    checkCuda(cudaMalloc(&d_A, bytes), "cudaMalloc(d_A)");
    checkCuda(cudaMalloc(&d_B, bytes), "cudaMalloc(d_B)");
    checkCuda(cudaMalloc(&d_C, bytes), "cudaMalloc(d_C)");

    for (int i = 0 ; i < N*N ; i++) {
        A[i] = i;
        B[i] = i+1;
    }
    checkCuda(cudaMemcpy(d_A, A.data(), bytes, cudaMemcpyHostToDevice), "copy A to device");
    checkCuda(cudaMemcpy(d_B, B.data(), bytes, cudaMemcpyHostToDevice), "copy B to device");

    // CPU reference, repeated Niter times to get an averaged execution time.
    auto t_start = std::chrono::high_resolution_clock::now();
    for (int i = 0; i < Niter ; i++)
        addCPU(A.data(), B.data(), C.data());
    auto t_end = std::chrono::high_resolution_clock::now();

    cudaDeviceProp properties;
    checkCuda(cudaGetDeviceProperties(&properties, 0), "cudaGetDeviceProperties");
    int nThreads = properties.maxThreadsPerBlock;  // largest block size the device accepts
    int nBlocks = (N*N + nThreads - 1) / nThreads; // ceil-div: enough blocks to cover N*N elements

    // Warm-up launch, kept out of the timed region.
    addKernel1D<<<nBlocks,nThreads>>>(d_A, d_B, d_C);
    checkCuda(cudaGetLastError(), "warm-up launch");

    // GPU timing with events, repeated Niter times to get an averaged execution time.
    checkCuda(cudaEventRecord(start), "cudaEventRecord(start)");
    for (int i = 0; i < Niter ; i++)
        addKernel1D<<<nBlocks,nThreads>>>(d_A, d_B, d_C);
    checkCuda(cudaGetLastError(), "timed launches");
    checkCuda(cudaEventRecord(stop), "cudaEventRecord(stop)");
    checkCuda(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)");
    float ms;
    checkCuda(cudaEventElapsedTime(&ms, start, stop), "cudaEventElapsedTime");

    // Only the result has to come back; the inputs were never modified on the device.
    checkCuda(cudaMemcpy(D.data(), d_C, bytes, cudaMemcpyDeviceToHost), "copy result to host");

    // Compare every one of the N*N elements; diff keeps the last mismatch found.
    int diff = 0;
    for (int i = 0 ; i < N*N ; i++) {
        if(D[i] != C[i]) diff = D[i] - C[i];
    }

    // Release device resources before reporting, so that both the failure and
    // the success path clean up.
    checkCuda(cudaEventDestroy(start), "cudaEventDestroy(start)");
    checkCuda(cudaEventDestroy(stop), "cudaEventDestroy(stop)");
    checkCuda(cudaFree(d_A), "cudaFree(d_A)");
    checkCuda(cudaFree(d_B), "cudaFree(d_B)");
    checkCuda(cudaFree(d_C), "cudaFree(d_C)");

    if(diff != 0) {
        printf("Wrong computation : diff = %d", diff);
        return 1;
    }
    printf("CPU execution time = %f ms\n", std::chrono::duration<double, std::milli>(t_end-t_start).count()/Niter);
    printf("GPU execution time = %f ms\n", ms/Niter);
    printf("%d %d %d\n",A[2],B[2],C[2]);
    return 0;
}

CPU execution time = 0.876779 ms
GPU execution time = 0.027474 ms
2 3 5

