CUDA 실습

* 기억할 명령어
 - 파이썬 노트북 환경에서 파일을 만들 때 사용하는 명령어 => %%writefile [파일이름]
 - 쉘 명령어를 실행할 때 사용하는 명령어 => ![명령어]

In [44]:
%%writefile cuda_01.cu

#include <stdio.h>

// Device-only helper: prints a fixed greeting from the GPU thread that calls it.
__device__ void hiDeviceFunction(void)
{
    printf("Hello! This is in hiDevF\n");
}

/*
 * Demo kernel: each thread prints its thread index and block index, the
 * number of threads per block, and its flattened index along x.
 * Only the x dimension of the launch is used.
 */
__global__ void helloCUDA(void)
{
    const unsigned int threadInBlock   = threadIdx.x;
    const unsigned int block           = blockIdx.x;
    const unsigned int threadsPerBlock = blockDim.x;

    printf("Hello thread [%d] in block [%d] \n", threadInBlock, block);
    printf("Hello! We have %d threads in a block\n", threadsPerBlock);
    // hiDeviceFunction();

    // Formula for the thread's unique ID across the whole (1-D) grid.
    const int uniqueId = block * threadsPerBlock + threadInBlock;
    printf("This is thread with unique ID[%d]\n", uniqueId);
}

/*
 * Launches helloCUDA with 1 block of 4 threads and waits for it to finish.
 *
 * Returns 0 on success, 1 if the launch or the kernel execution reported a
 * CUDA error (the error string is written to stderr).
 */
int main()
{
    // Launch configuration is <<<number of blocks, threads per block>>>.
    helloCUDA<<<1, 4>>>();

    // A kernel launch returns nothing; a bad launch configuration is only
    // visible through cudaGetLastError().
    cudaError_t status = cudaGetLastError();

    // Wait for helloCUDA to finish; this also flushes device-side printf
    // output and reports errors raised while the kernel was running.
    if (status == cudaSuccess)
        status = cudaDeviceSynchronize();

    if (status != cudaSuccess)
    {
        fprintf(stderr, "helloCUDA failed: %s\n", cudaGetErrorString(status));
        return 1;
    }

    // Optional: cudaDeviceReset() would explicitly release this process's
    // device resources before exit.

    return 0;
}

Overwriting cuda_01.cu


In [46]:
!nvcc cuda_01.cu -o cuda_01

In [47]:
!ls

cuda_01  cuda_01.cu  sample_data


In [48]:
!./cuda_01

Hello thread [0] in block [0] 
Hello thread [1] in block [0] 
Hello thread [2] in block [0] 
Hello thread [3] in block [0] 
Hello! We have 4 threads in a block
Hello! We have 4 threads in a block
Hello! We have 4 threads in a block
Hello! We have 4 threads in a block


하나의 스레드는 하나의 streaming processor에서 수행

하나의 블록은 하나의 streaming multiprocessor에서 수행

*GPU Memory 관리

The CPU (host) manages device (GPU) memory

- cudaMalloc(void** pointer, size_t num_bytes)
- cudaMemset(void* pointer, int value, size_t count)
- cudaFree(void* pointer)

- cudaMemcpy(void* dst, const void* src, size_t num_bytes, enum cudaMemcpyKind direction);

direction =>

  1. cudaMemcpyHostToDevice

  2. cudaMemcpyDeviceToHost

  3. cudaMemcpyDeviceToDevice


In [31]:
%%writefile cuda_saxpy.cu
/*
  Example: SAXPY Kernel

  [compute] for(i=0; i<n; i++) y[i] = a*x[i] + y[i]
  Each thread processes one element
*/

#include <stdio.h>

/*
 * Host reference SAXPY: updates y in place so that y[k] = a * x[k] + y[k]
 * for every k in [0, n).
 *
 *   n - number of elements to process; nothing is done when n <= 0
 *   a - scalar multiplier applied to x
 *   x - input vector (read only by this function)
 *   y - input/output vector, overwritten with the result
 */
void saxpy_CPU(int n, float a, float* x, float* y)
{
    int k = 0;
    while (k < n)
    {
        y[k] = a * x[k] + y[k];
        ++k;
    }
}

/*
 * SAXPY kernel: y[i] = a * x[i] + y[i] for every i in [0, n).
 *
 * Launch layout: 1-D grid of 1-D blocks (only the x dimension is used).
 * Any grid size is valid: each thread starts at its global index and then
 * advances by the total number of launched threads, so elements beyond
 * gridDim.x * blockDim.x are still processed. When the grid covers n, each
 * thread handles at most one element, as before.
 *
 *   n - number of elements; nothing is done when n <= 0
 *   a - scalar multiplier applied to x
 *   x - device pointer to the input vector (at least n floats)
 *   y - device pointer to the input/output vector (at least n floats)
 *
 * Uses no shared memory and contains no block-level synchronization.
 */
__global__ void saxpy_GPU(int n, float a, float* x, float* y)
{
    // Index in size_t so blockIdx.x * blockDim.x cannot wrap in 32-bit math.
    const size_t count  = (n > 0) ? (size_t)n : 0;
    const size_t stride = (size_t)gridDim.x * blockDim.x;

    // First element is the thread's unique global index; later ones are one
    // whole grid further along.
    for (size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
         i < count;
         i += stride)
    {
        y[i] = a * x[i] + y[i];
    }
}


// Reports a failed CUDA runtime call on stderr; returns true when `status`
// is cudaSuccess and false otherwise.
static bool cudaSucceeded(cudaError_t status, const char* what)
{
    if (status == cudaSuccess)
        return true;

    fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

/*
 * SAXPY driver: fills x with 1.0 and y with 2.0 on the host, copies both to
 * the GPU, runs saxpy_GPU with scalar 1.0, copies y back and prints its
 * first 10 values.
 *
 * Returns 0 on success and 1 if a host allocation or any CUDA call fails.
 * All host and device buffers are released on every path.
 */
int main()
{
    const int    n      = 1024 * 1024;
    const size_t bytes  = (size_t)n * sizeof(float);
    const float  scalar = 1.0f;

    // Host vectors x and y.
    float* x = (float*)malloc(bytes);
    float* y = (float*)malloc(bytes);

    // Device copies; kept NULL until allocated so the cleanup below is
    // safe on every path.
    float* d_x = NULL;
    float* d_y = NULL;

    int exitCode = 1;

    if (x == NULL || y == NULL)
    {
        fprintf(stderr, "host allocation of %zu bytes failed\n", bytes);
    }
    // Allocate GPU global memory for x and y.
    else if (cudaSucceeded(cudaMalloc((void**)&d_x, bytes), "cudaMalloc(d_x)") &&
             cudaSucceeded(cudaMalloc((void**)&d_y, bytes), "cudaMalloc(d_y)"))
    {
        // Initialize the host vectors.
        for (int i = 0; i < n; i++)
        {
            x[i] = 1.0f;
            y[i] = 2.0f;
        }

        if (cudaSucceeded(cudaMemcpy(d_x, x, bytes, cudaMemcpyHostToDevice), "copy x to device") &&
            cudaSucceeded(cudaMemcpy(d_y, y, bytes, cudaMemcpyHostToDevice), "copy y to device"))
        {
            const int threadsPerBlock = 256;
            // Ceiling division so a trailing partial block is still launched
            // when n is not a multiple of threadsPerBlock.
            const int numBlock = (n + threadsPerBlock - 1) / threadsPerBlock;

            saxpy_GPU<<<numBlock, threadsPerBlock>>>(n, scalar, d_x, d_y);

            // CPU reference, kept for comparison:
            // saxpy_CPU(n, scalar, x, y);

            // cudaGetLastError() catches launch-configuration errors;
            // cudaDeviceSynchronize() surfaces errors raised while the
            // kernel ran. Then copy the result back to host memory.
            if (cudaSucceeded(cudaGetLastError(), "saxpy_GPU launch") &&
                cudaSucceeded(cudaDeviceSynchronize(), "saxpy_GPU execution") &&
                cudaSucceeded(cudaMemcpy(y, d_y, bytes, cudaMemcpyDeviceToHost), "copy y to host"))
            {
                for (int i = 0; i < 10; i++)
                    printf("y[i] : %f \n", y[i]);

                exitCode = 0;
            }
        }
    }

    // cudaFree(NULL) and free(NULL) are no-ops, so this is safe even when an
    // earlier allocation failed.
    cudaFree(d_x);
    cudaFree(d_y);
    free(x);
    free(y);

    return exitCode;
}

Overwriting cuda_saxpy.cu


In [32]:
!ls

cuda_saxpy.cu  sample_data  saxpy


In [33]:
!nvcc cuda_saxpy.cu -o saxpy

In [34]:
!./saxpy

y[i] : 3.000000 
y[i] : 3.000000 
y[i] : 3.000000 
y[i] : 3.000000 
y[i] : 3.000000 
y[i] : 3.000000 
y[i] : 3.000000 
y[i] : 3.000000 
y[i] : 3.000000 
y[i] : 3.000000 


In [35]:
!nvprof --print-gpu-trace ./saxpy

==545== NVPROF is profiling process 545, command: ./saxpy
y[i] : 3.000000 
y[i] : 3.000000 
y[i] : 3.000000 
y[i] : 3.000000 
y[i] : 3.000000 
y[i] : 3.000000 
y[i] : 3.000000 
y[i] : 3.000000 
y[i] : 3.000000 
y[i] : 3.000000 
==545== Profiling application: ./saxpy
==545== Profiling result:
   Start  Duration            Grid Size      Block Size     Regs*    SSMem*    DSMem*      Size  Throughput  SrcMemType  DstMemType           Device   Context    Stream  Name
346.36ms  707.88us                    -               -         -         -         -  4.0000MB  5.5183GB/s    Pageable      Device     Tesla T4 (0)         1         7  [CUDA memcpy HtoD]
347.29ms  725.09us                    -               -         -         -         -  4.0000MB  5.3873GB/s    Pageable      Device     Tesla T4 (0)         1         7  [CUDA memcpy HtoD]
348.02ms  44.831us           (4096 1 1)       (256 1 1)        16        0B        0B         -           -           -           -     Tesla T4 (0)      