# CUDA Optimization

이번 Tutorial에서는 CUDA 최적화에 대하여 다뤄보도록 하겠습니다.

CUDA 프로그래밍에 대한 최적화 기법은 다음과 같이 정리할 수 있습니다.
* 병렬처리 최적화
    * CUDA Occupancy 최적화
    * Asynchronous Operation
* 대역폭 최적화
    * CPU  GPU 데이터 전송 최적화
    * GPU 메모리 최적화
* 분기 최적화
* 명령어 최적화

하나씩 살펴 보면서, 성능이 어떻게 향상되어 가는지 살펴보도록 하겠습니다.

# 1. SGEMM Example & Occupancy Calculator

최적화를 익히기 위한 예제인 SGEMM을 살펴보도록 하겠습니다.

SGEMM은 다음과 같이 Matrix Multiplication + Summation을 말합니다.
$$ c_{ij} = \alpha \sum _{k=1} ^m a_{ik} b_{kj} + \beta c_{ij} \space\space for \space\space C \leftarrow \alpha A B + \beta C$$

이 코드에는 sgemm 코드와 함께 수행시간을 측정하기 위한 코드가 포함되어 있습니다.

In [91]:
%%file sgemm.cu

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

typedef enum TARGET {HOST, DEVICE} TARGET;

typedef struct {
    int width;
    int height;
    float *elements;
} Matrix;

__global__ void sgemm(Matrix A, Matrix B, Matrix C, 
                      const float alpha, const float beta, 
                      const int width, const int height) {
    int idx_x = blockDim.x * blockIdx.x + threadIdx.x;
    int idx_y = blockDim.y * blockIdx.y + threadIdx.y;
    
    if (idx_x >= width || idx_y >= height)
        return;
    
    // TODO: Write sgemm code "C = alpha * A * B + beta * C"
    float value = 0.f;
    for (int i = 0; i < width; i++)
        value += value + A.elements[idx_y * width + i] * B.elements[i * width + idx_x];
    C.elements[idx_y * width + idx_x] = alpha * value + beta * C.elements[idx_y * width + idx_x];
    printf("%f (%d, %d)\n", alpha * value + beta * C.elements[idx_y * width + idx_x], idx_y, idx_x);
    /////////////
}

void InitMatrix(Matrix &mat, const int width, const int height, TARGET target = HOST);
bool IsMatDiff(Matrix &A, Matrix &B);
void sgemm_host(Matrix &A, Matrix &B, Matrix &C,
               const float alpha, const float beta,
               const int width, const int height);

int main(int argv, char* argc[]) {
    Matrix A, B, C_host, C_device;
    Matrix dA, dB, dC;
    const float alpha = 2.f;
    const float beta = .5f;
    const int width = 4;
    const int height = width;
    float elapsed_gpu;
    double elapsed_cpu;
    
    // CUDA Event Create to estimate elased time
    cudaEvent_t start, stop;
    struct timespec begin, finish;
    
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    
    // Initialize host matrix
    InitMatrix(A, width, height);
    InitMatrix(B, width, height);
    InitMatrix(C_host, width, height);
    InitMatrix(C_device, width, height);

    // CUDA Memory Initialize
    InitMatrix(dA, width, height, DEVICE);
    InitMatrix(dB, width, height, DEVICE);
    InitMatrix(dC, width, height, DEVICE);
    
    // CUDA Operation
    cudaEventRecord(start, 0);
    clock_gettime(CLOCK_MONOTONIC, &begin);
    
    //////////////////////
    // Copy host data to the device (CUDA global memory)
    // TODO: Write CUDA Memcpy code (cpu -> gpu)
    cudaMemcpy(dA.elements, A.elements, width * height * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(dB.elements, B.elements, width * height * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(dC.elements, C_device.elements, width * height * sizeof(float), cudaMemcpyHostToDevice);
    
    // Launch GPU Kernel
    // TODO: Write sgemm Kernel Execution Code
    // Please refer kernel code above.
    dim3 blockDim(16, 16);
    dim3 gridDim((width + blockDim.x - 1) / blockDim.x, (height + blockDim.y - 1) / blockDim.y);
    sgemm<<<gridDim, blockDim>>>(dA, dB, dC, alpha, beta, width, height);
    
    // Copy computation result from the Device the host memory
    // TODO: Write CUDA memcpy code (gpu -> cpu)
    cudaMemcpy(C_device.elements, dC.elements, width * height * sizeof(float), cudaMemcpyDeviceToHost);
    
    //////////////////////
    clock_gettime(CLOCK_MONOTONIC, &finish);
    cudaEventRecord(stop, 0);
    
    // Estimate CUDA operation time
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    
    cudaEventElapsedTime(&elapsed_gpu, start, stop);
    printf("SGEMM CUDA Elapsed time: %f ms\n", elapsed_gpu);
    elapsed_cpu = (finish.tv_sec - begin.tv_sec);
    elapsed_cpu += (finish.tv_nsec - begin.tv_nsec) / 1000000000.0;
    printf("Host time: %f ms\n", elapsed_cpu * 1000);
    
    // Compute CPU Operation
    clock_gettime(CLOCK_MONOTONIC, &begin);
    sgemm_host(A, B, C_host, alpha, beta, width, height);
    clock_gettime(CLOCK_MONOTONIC, &finish);
    
    elapsed_cpu = (finish.tv_sec - begin.tv_sec);
    elapsed_cpu += (finish.tv_nsec - begin.tv_nsec) / 1000000000.0;
    printf("SGEMM CPU only time: %f ms\n", elapsed_cpu * 1000);
    
    if (IsMatDiff(C_host, C_device)) {
        printf("Something wrong!!\n");
    }
    else {
        printf("Success !!\n");
    }
    
    // finalize CUDA event
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    
    // Finalize
    cudaFree(dA.elements);
    cudaFree(dB.elements);
    cudaFree(dC.elements);
    
    free(A.elements);
    free(B.elements);
    free(C_host.elements);
    free(C_device.elements);
    
    return 0;
}

void InitMatrix(Matrix &mat, const int width, const int height, TARGET target) {
    mat.width = width;
    mat.height = height;
    
    if (target == DEVICE) {
        cudaMalloc((void**)&mat.elements, width * height * sizeof(float));
    }
    else {
        mat.elements = (float*)malloc(width * height * sizeof(float));
    
        for (int row = 0; row < height; row++) {
            for (int col = 0; col < width; col++) {
                mat.elements[row * width + col] = row * width + col * 0.001;
            }
        }
    }
}

bool IsMatDiff(Matrix &A, Matrix &B) {
    if (A.width != B.width || A.height != B.height) {
        return true;
    }
    
    unsigned int count = 0;
    for (int row = 0; row < A.height; row++) {
        for (int col = 0; col < A.width; col++) {
            //count += (A.elements[row * A.width + col] - B.elements[row * A.width + col]) * \
            //    (A.elements[row * A.width + col] - B.elements[row * A.width + col]);
            count += A.elements[row * A.width + col] != B.elements[row * A.width + col] ? 1 : 0;
        }
    }
    
    printf("%d\n", count);
    if (count != 0.f) {
        
        return true;
    }
    return false;
}

void sgemm_host(Matrix &A, Matrix &B, Matrix &C, const float alpha, const float beta, const int width, const int height) {
    for (int row = 0; row < C.height; row++) {
        for (int col = 0; col < C.width; col++) {
            float value = 0.f;
            for (int e = 0; e < C.width; e++)
                value += A.elements[row * width + e] * B.elements[e * width + col];
            C.elements[row * width + col] = alpha * value + beta * C.elements[row * width + col];
            printf("%f (%d, %d)\n", alpha * value + beta * C.elements[row * width + col], row, col);
        }
    }
}

Overwriting sgemm.cu


In [92]:
! make sgemm

nvcc sgemm.cu -gencode=arch=compute_30,code=sm_30 -o sgemm 


In [93]:
! ./sgemm

0.168000 (0, 0)
0.168268 (0, 1)
0.168536 (0, 2)
0.168804 (0, 3)
289.167969 (1, 0)
289.216278 (1, 1)
289.264526 (1, 2)
289.312775 (1, 3)
578.167969 (2, 0)
578.264282 (2, 1)
578.360535 (2, 2)
578.456787 (2, 3)
867.167969 (3, 0)
867.312256 (3, 1)
867.456543 (3, 2)
867.600830 (3, 3)
SGEMM CUDA Elapsed time: 1.060288 ms
Host time: 1.308397 ms
0.168000 (0, 0)
0.168268 (0, 1)
0.168536 (0, 2)
0.168804 (0, 3)
289.167969 (1, 0)
289.216278 (1, 1)
289.264526 (1, 2)
289.312805 (1, 3)
578.167969 (2, 0)
578.264282 (2, 1)
578.360596 (2, 2)
578.456787 (2, 3)
867.168091 (3, 0)
867.312256 (3, 1)
867.456604 (3, 2)
867.600830 (3, 3)
SGEMM CPU only time: 0.029641 ms
4
Something wrong!!


In [94]:
%%file Makefile

NVCC = nvcc
NVCC_OPTS = --ptxas-options=--verbose -gencode=arch=compute_35,code=sm_35 -I/usr/local/cuda/samples/common/inc

all: sgemm
    
sgemm: sgemm.cu
	$(NVCC) sgemm.cu -gencode=arch=compute_30,code=sm_30 -o sgemm 
    
sgemm_v: 
	$(NVCC) sgemm.cu $(NVCC_OPTS) -o sgemm 
    
sgemm_resize_block: sgemm_resize_block.cu
	$(NVCC) sgemm_resize_block.cu $(NVCC_OPTS) -o sgemm_resize_block
    
sgemm_async_copy: sgemm_async_copy.cu
	$(NVCC) sgemm_async_copy.cu $(NVCC_OPTS) -o sgemm_async_copy
    
sgemm_pinned_memory: sgemm_pinned_memory.cu
	$(NVCC) sgemm_pinned_memory.cu $(NVCC_OPTS) -o sgemm_pinned_memory

sgemm_stream: sgemm_stream.cu
	$(NVCC) sgemm_stream.cu $(NVCC_OPTS) -o sgemm_stream


Overwriting Makefile


## CUDA Event
CUDA Kernel의 전체 수행시간을 측정하기 위해서 CUDA Event를 사용했습니다.
CUDA Event는 CPU programming에서 사용하는 Event handle과 같이 다양하게 활용할 수 있습니다. 하지만 이번 시간에는 CUDA의 성능향상을 확인하기 위한 용도로서 CUDA Kernel의 수행시간을 측정하는 부분에만 집중하도록 하겠습니다.

In [123]:
%%file cudaEvent.hold
// CUDA Event Create to estimate elased time
cudaEvent_t start, stop;
cudaeventRecord(start, 0);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);

cudaEventElapsedTime(&elapsed_gpu, start, stop);
printf("SGEMM CUDA Elapsed time: %f ms\n", elapsed_gpu);

Overwriting cudaEvent.hold


## CUDA Occupancy Calculator

![](./CUDA Occupancy Calculator.png)
CUDA Occupancy Calculator는 CUDA SM 상에서 자원을 효율적으로 활용하기 위해 필요한 이론적 계산치를 계산하는데 유용한 엑셀파일입니다. [다운로드](./CUDA_Occupancy_calculator.xls)

여기에 입력할 정보는 대부분은 CUDA 코드를 작성하시면서 설계하는 값이지만, CUDA thread당 register의 수는 compiler에서 사용한 결과를 확인할 수 밖에 없습니다. 따라서 nvcc에게 다음과 같은 option을 주어서 CUDA thread 당 register 수를 확인할 수 있습니다.

**--ptxas-options=--verbose**

Compile 옵션을 주고 나온 결과를 확인해 보겠습니다.

In [124]:
! make sgemm_v

nvcc sgemm.cu --ptxas-options=--verbose -gencode=arch=compute_35,code=sm_35 -o sgemm 
ptxas info    : 0 bytes gmem
ptxas info    : Compiling entry function '_Z5sgemm6MatrixS_S_ffii' for 'sm_35'
ptxas info    : Function properties for _Z5sgemm6MatrixS_S_ffii
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 13 registers, 384 bytes cmem[0]


그렇다면 이 값을 가지고 Occupancy Calculator를 통해 어느정도 최적화 되어있는지 확인해봅시다.
그리고 다른 크기로 바꾸어서 성능이 어떻게 바뀌는지 확인해보세요. CPU 코드는 느려서 지웠습니다. :)

In [1]:
%%file sgemm_resize_block.cu

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

typedef enum TARGET {HOST, DEVICE} TARGET;

typedef struct {
    int width;
    int height;
    float *elements;
} Matrix;

__global__ void sgemm(Matrix A, Matrix B, Matrix C, 
                      const float alpha, const float beta, 
                      const int width, const int height) {
    int idx_x = blockDim.x * blockIdx.x + threadIdx.x;
    int idx_y = blockDim.y * blockIdx.y + threadIdx.y;
    int idx = idx_y * width + idx_x;
    
    if (idx_x >= width || idx_y >= height)
        return;
    
    // TODO: Copy sgemm code from above you write
}

void InitMatrix(Matrix &mat, const int width, const int height, TARGET target = HOST);

int main(int argv, char* argc[]) {
    Matrix A, B, C_host, C_device;
    Matrix dA, dB, dC;
    const float alpha = 2.f;
    const float beta = .5f;
    const int width = 2048;
    const int height = 2048;
    float elapsed_gpu;
    double elapsed_cpu;
    
    // CUDA Event Create to estimate elased time
    cudaEvent_t start, stop;
    struct timespec begin, finish;
    
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    
    // Initialize host matrix
    InitMatrix(A, width, height);
    InitMatrix(B, width, height);
    InitMatrix(C_device, width, height);

    // CUDA Memory Initialize
    InitMatrix(dA, width, height, DEVICE);
    InitMatrix(dB, width, height, DEVICE);
    InitMatrix(dC, width, height, DEVICE);
    
    // CUDA Operation
    cudaEventRecord(start, 0);
    clock_gettime(CLOCK_MONOTONIC, &begin);
    
    // Copy host data to the device (CUDA global memory)
    cudaMemcpy(dA.elements, A.elements, width * height * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(dB.elements, B.elements, width * height * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(dC.elements, C_device.elements, width * height * sizeof(float), cudaMemcpyHostToDevice);
    
    // Launch GPU Kernel
    // TODO: Defic
    
    dim3 blockDim(8, 8);
    dim3 gridDim((width + blockDim.x - 1) / blockDim.x, (height + blockDim.y - 1) / blockDim.y);
    sgemm<<<gridDim, blockDim>>>(dA, dB, dC, alpha, beta, width, height);
    
    // Copy computation result from the Device the host memory
    cudaMemcpy(C_device.elements, dC.elements, width * height * sizeof(float), cudaMemcpyDeviceToHost);
    clock_gettime(CLOCK_MONOTONIC, &finish);
    cudaEventRecord(stop, 0);
    
    // Estimate CUDA operation time
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    
    cudaEventElapsedTime(&elapsed_gpu, start, stop);
    printf("SGEMM CUDA Elapsed time: %f ms\n", elapsed_gpu);
    elapsed_cpu = (finish.tv_sec - begin.tv_sec);
    elapsed_cpu += (finish.tv_nsec - begin.tv_nsec) / 1000000000.0;
    printf("Host time: %f ms\n", elapsed_cpu * 1000);
    
    // finalize CUDA event
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    
    // Finalize
    cudaFree(dA.elements);
    cudaFree(dB.elements);
    cudaFree(dC.elements);
    
    free(A.elements);
    free(B.elements);
    //free(C_host.elements);
    free(C_device.elements);
    
    return 0;
}

void InitMatrix(Matrix &mat, const int width, const int height, TARGET target) {
    mat.width = width;
    mat.height = height;
    
    if (target == DEVICE) {
        cudaMalloc((void**)&mat.elements, width * height * sizeof(float));
    }
    else {
        mat.elements = (float*)malloc(width * height * sizeof(float));
    
        for (int row = 0; row < height; row++) {
            for (int col = 0; col < width; col++) {
                mat.elements[row * width + col] = row * width + col * 0.001;
            }
        }
    }
}


Overwriting sgemm_resize_block.cu


In [61]:
! make sgemm_resize_block

nvcc sgemm_resize_block.cu --ptxas-options=--verbose -o sgemm_resize_block
ptxas info    : 0 bytes gmem
ptxas info    : Compiling entry function '_Z5sgemm6MatrixS_S_ffii' for 'sm_20'
ptxas info    : Function properties for _Z5sgemm6MatrixS_S_ffii
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 10 registers, 96 bytes cmem[0]


In [62]:
! ./sgemm_resize_block

SGEMM CUDA Elapsed time: 49.629311 ms
Host time: 49.621832 ms


보시는 바와 같이 CUDA Occupancy에 따른 최적화 정도에 따라 CUDA 병렬화의 정도가 변하고, 성능과 밀접한 연관성이 있음을 확인 할 수 있었습니다.