# GPU Memory Optimization

CUDA에서는 최적의 성능을 제공하기 위해서 다양한 메모리를 제공하고 있습니다.

* L1/L2 Cache
* Shared Memory
* Constant Memory
* Texture Memory
* Global Memory

이번 Tutorial에서는 L1/L2 Cache를 제외한 각각의 메모리가 갖는 특징을 예제를 통하여 살펴보도록 하겠습니다. 또한 Global Memory를 효율적으로 사용하기 위한 방법을 살펴보겠습니다.

## 1.  Shared Memory

Shared Memory는 데이터를 반복적으로 사용하는 데에 있어서 장점을 갖고 있습니다. SGEMM 예제를 통해서 이를 살펴보겠습니다.

Shared Memory를 사용하는 방법은 다음과 같습니다.

`__shared__ <type> <variable name>[size];`
    
이렇게 선언하면 CUDA block 내의 모든 thread가 해당 Shared Memory를 함께 이용할 수 있게 되며, CUDA thread를 이용하여 Shared Memory에 데이터를 복사해 넣은 뒤 사용하면 됩니다.

코드 작성이 어려우시다면 [Solution](./sgemm_shared_solution.cu) 코드를 살펴보시기 바랍니다.

In [None]:
%%file sgemm_shared.cu

#include "sgemm.cuh"
// Exercise skeleton for a tiled matrix multiply-accumulate kernel that stages
// tiles of A and B in shared memory and walks them in steps of BLOCK_DIM.
//
// NOTE: this cell is intentionally incomplete and does not compile as written.
// Every FIXME is a placeholder for the reader to fill in, and the s_A / s_B
// declarations still need array extents, because they are indexed below as
// 2-D arrays ([threadIdx.y][threadIdx.x], [threadIdx.y][e], [e][threadIdx.x]).
//
// Each thread that passes the final bounds check writes
//   C[idx_y * width + idx_x] = alpha * value + beta * C[idx_y * width + idx_x]
// where value accumulates s_A[threadIdx.y][e] * s_B[e][threadIdx.x] over all tiles.
//
// Parameters:
//   A, B           - input matrices, read through their elements pointers
//   C              - matrix that is read, scaled by beta, and overwritten
//   alpha          - scale applied to the accumulated product
//   beta           - scale applied to the existing value of C
//   width, height  - bounds used for the tile loop and for the output guard
template <typename T>
__global__ void sgemm_shared(Matrix<T> A, Matrix<T> B, Matrix<T> C, 
                      const T alpha, const T beta, 
                      const int width, const int height) {
    // Global column (x) and row (y) of the output element owned by this thread.
    int idx_x = blockDim.x * blockIdx.x + threadIdx.x;
    int idx_y = blockDim.y * blockIdx.y + threadIdx.y;
    
    // TODO: Write shared memory declaration code
    // (placeholders: declared as scalars here, but used as 2-D tiles below)
    __shared__ T s_A;
    __shared__ T s_B;
    
    // Outer loop: advance along the shared dimension one BLOCK_DIM-wide tile at a time.
    T value = 0;
    for (int step = 0; step < width; step += BLOCK_DIM) {
        // TODO: Write block obtaining Code
        // Each thread stores one element of the A tile and one of the B tile;
        // the ternary stores 0 when the bounds test (to be filled in) fails.
        s_A[threadIdx.y][threadIdx.x] = 
                            FIXME < width && FIXME < height ? 
                                A.elements[FIXME] : 0;
        s_B[threadIdx.y][threadIdx.x] = 
                            FIXME < width && FIXME < height ? 
                                B.elements[FIXME] : 0;
        
        // Confirm that all required data is loaded
        // (placeholder; presumably a block-wide barrier such as __syncthreads() - confirm against the solution)
        FIXME
        
        // Inner loop: partial dot product over the tile currently held in s_A / s_B.
        for (int e = 0; e < BLOCK_DIM; e++)
            value += s_A[threadIdx.y][e] * s_B[e][threadIdx.x];
    
        // Confirm that all operation is finished above
        // (placeholder; presumably a second barrier so the tiles are not overwritten while still being read)
        FIXME
    }

    // The bounds check comes after the tile loop, so threads outside the
    // output range still execute every loop iteration before returning here.
    if (idx_x >= width || idx_y >= height)
        return;
    
    // Write the result to device memory
    C.elements[idx_y * width + idx_x] = alpha * value + beta * C.elements[idx_y * width + idx_x];
}

// Host-side launcher for the sgemm_shared kernel.
//
// The kernel walks the shared dimension in steps of BLOCK_DIM and iterates
// e < BLOCK_DIM over its shared tiles, so the thread block must be exactly
// BLOCK_DIM x BLOCK_DIM. The block size is therefore derived from the macro
// instead of repeating the literal 16, which keeps both in sync if BLOCK_DIM
// is ever changed.
//
// Parameters:
//   dA, dB, dC     - matrices forwarded to the kernel by value
//   alpha, beta    - scaling factors forwarded to the kernel
//   width, height  - output extent; the grid is sized by ceiling division
//                    so that partial tiles at the edges are covered
template <typename T>
void launch_sgemm_shared(Matrix<T> &dA, Matrix<T> &dB, Matrix<T> &dC,
                      const T alpha, const T beta, 
                      const int width, const int height) {
    dim3 blockDim(BLOCK_DIM, BLOCK_DIM);
    dim3 gridDim((width + blockDim.x - 1) / blockDim.x, (height + blockDim.y - 1) / blockDim.y);
    sgemm_shared<<<gridDim, blockDim>>>(dA, dB, dC, alpha, beta, width, height);
}

In [None]:
%%file sgemm.cuh

#ifndef _SGEMM_H_
#define _SGEMM_H_

// Tile step and inner-loop bound used by the sgemm_shared kernel.
#define BLOCK_DIM 16

// Tells InitMatrix whether to allocate with cudaMalloc (DEVICE) or on the host (HOST).
typedef enum TARGET {HOST, DEVICE} TARGET;
// Host allocation flavour: NORMAL uses malloc/free, PINNED uses cudaHostAlloc/cudaFreeHost.
typedef enum MEMTYPE {NORMAL, PINNED} MEMTYPE;

// Plain matrix descriptor that is passed by value to the kernels.
// Elements are addressed as elements[row * width + col] throughout this tutorial.
template <typename T>
struct Matrix {
    int width;    // number of columns
    int height;   // number of rows
    T *elements;  // buffer of width * height values, allocated by InitMatrix on the host or the device
};

#endif /* _SGEMM_H_ */

In [None]:
%%file sgemm.cu

#include "sgemm.cuh"
// Baseline matrix multiply-accumulate kernel that reads A and B directly
// through their elements pointers, without staging data in shared memory.
//
// One thread produces one output element:
//   C[row * width + col] = alpha * dot(A row, B column) + beta * C[row * width + col]
// The dot product runs over `width` terms and both inputs are indexed with a
// row stride of `width`.
//
// Threads whose (col, row) falls outside width x height do nothing.
template <typename T>
__global__ void sgemm(Matrix<T> A, Matrix<T> B, Matrix<T> C,
                      const T alpha, const T beta,
                      const int width, const int height) {
    const int col = blockDim.x * blockIdx.x + threadIdx.x;
    const int row = blockDim.y * blockIdx.y + threadIdx.y;

    if (col < width && row < height) {
        // Accumulate the products in the same order as a plain sequential loop.
        T acc = 0;
        for (int k = 0; k < width; k++) {
            acc += A.elements[row * width + k] * B.elements[k * width + col];
        }

        const int out = row * width + col;
        C.elements[out] = alpha * acc + beta * C.elements[out];
    }
}

// Host-side launcher for the baseline sgemm kernel.
// Uses 16 x 16 thread blocks and sizes the grid by ceiling division so that
// every element of the width x height output is covered by some thread.
template <typename T>
void launch_sgemm(Matrix<T> &dA, Matrix<T> &dB, Matrix<T> &dC,
                  const T alpha, const T beta,
                  const int width, const int height) {
    const dim3 threads(16, 16);
    const unsigned int blocks_x = (width + threads.x - 1) / threads.x;
    const unsigned int blocks_y = (height + threads.y - 1) / threads.y;
    const dim3 blocks(blocks_x, blocks_y);

    sgemm<<<blocks, threads>>>(dA, dB, dC, alpha, beta, width, height);
}

In [8]:
%%file test_shared.cu

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include "sgemm.cuh"
#include "sgemm.cu"
#include "sgemm_shared_solution.cu"

// Sets the dimensions of mat and allocates its element buffer. With the
// defaults (HOST, NORMAL) the buffer is allocated on the host and filled;
// with target == DEVICE it is allocated on the device and left unfilled.
template <typename T>
void InitMatrix(Matrix<T> &mat, const int width, const int height, TARGET target = HOST, MEMTYPE memtype = NORMAL);

// Prints both matrices and returns true when their dimensions or any
// element differ, false when they match.
template <typename T>
bool IsMatDiff(Matrix<T> &A, Matrix<T> &B);

// Abort the program with a diagnostic when a CUDA runtime call fails.
// Without this, a failed launch or copy leaves C and D at their identical
// initial values and the comparison below would report a false "Success !!".
static void CheckCuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Test driver: runs the baseline and the shared-memory SGEMM kernels on the
// same inputs, reports the kernel time of each (CUDA events) and the overall
// host-side time (copies + kernels), then compares the two results.
//
// Returns 0 on normal completion; exits with EXIT_FAILURE on any CUDA error.
int main(int argc, char* argv[]) {
    Matrix<float> A, B, C, D;
    Matrix<float> dA, dB, dC, dD;
    const float alpha = 2.f;
    const float beta = .5f;
    const int width = 4;
    const int height = width;
    const size_t bytes = (size_t)width * height * sizeof(float);
    float elapsed_gpu;
    double elapsed_cpu;
    
    // Select Host memory type (NORMAL, PINNED)
    MEMTYPE memtype = PINNED;
    
    // CUDA events used to measure the elapsed time of each kernel
    cudaEvent_t start_org, stop_org, start_opt, stop_opt;
    struct timespec begin, finish;
    
    CheckCuda(cudaEventCreate(&start_org), "cudaEventCreate(start_org)");
    CheckCuda(cudaEventCreate(&stop_org), "cudaEventCreate(stop_org)");
    CheckCuda(cudaEventCreate(&start_opt), "cudaEventCreate(start_opt)");
    CheckCuda(cudaEventCreate(&stop_opt), "cudaEventCreate(stop_opt)");
    
    // Initialize host matrices
    InitMatrix(A, width, height, HOST, memtype);
    InitMatrix(B, width, height, HOST, memtype);
    InitMatrix(C, width, height, HOST, memtype);
    InitMatrix(D, width, height, HOST, memtype);

    // Allocate device matrices
    InitMatrix(dA, width, height, DEVICE);
    InitMatrix(dB, width, height, DEVICE);
    InitMatrix(dC, width, height, DEVICE);
    InitMatrix(dD, width, height, DEVICE);
    
    // Host-side timing covers the copies and both kernels
    clock_gettime(CLOCK_MONOTONIC, &begin);
    
    // Copy host data to the device (CUDA global memory)
    CheckCuda(cudaMemcpyAsync(dA.elements, A.elements, bytes, cudaMemcpyHostToDevice), "copy A to device");
    CheckCuda(cudaMemcpyAsync(dB.elements, B.elements, bytes, cudaMemcpyHostToDevice), "copy B to device");
    CheckCuda(cudaMemcpyAsync(dC.elements, C.elements, bytes, cudaMemcpyHostToDevice), "copy C to device");
    CheckCuda(cudaMemcpyAsync(dD.elements, D.elements, bytes, cudaMemcpyHostToDevice), "copy D to device");
    
    // Launch the baseline kernel; cudaGetLastError reports an invalid launch
    // configuration, cudaEventSynchronize reports errors raised during execution.
    CheckCuda(cudaEventRecord(start_org, 0), "cudaEventRecord(start_org)");
    launch_sgemm(dA, dB, dC, alpha, beta, width, height);
    CheckCuda(cudaGetLastError(), "launch_sgemm");
    CheckCuda(cudaEventRecord(stop_org, 0), "cudaEventRecord(stop_org)");
    CheckCuda(cudaEventSynchronize(stop_org), "sgemm execution");
    
    // Launch the shared-memory kernel
    CheckCuda(cudaEventRecord(start_opt, 0), "cudaEventRecord(start_opt)");
    launch_sgemm_shared(dA, dB, dD, alpha, beta, width, height);
    CheckCuda(cudaGetLastError(), "launch_sgemm_shared");
    CheckCuda(cudaEventRecord(stop_opt, 0), "cudaEventRecord(stop_opt)");
    CheckCuda(cudaEventSynchronize(stop_opt), "sgemm_shared execution");
    
    // Copy the computation results from the device to host memory
    CheckCuda(cudaMemcpyAsync(C.elements, dC.elements, bytes, cudaMemcpyDeviceToHost), "copy C to host");
    CheckCuda(cudaMemcpyAsync(D.elements, dD.elements, bytes, cudaMemcpyDeviceToHost), "copy D to host");
    
    // Wait for the copies to finish before reading the results or the clock
    CheckCuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize");
    clock_gettime(CLOCK_MONOTONIC, &finish);
    
    CheckCuda(cudaEventElapsedTime(&elapsed_gpu, start_org, stop_org), "cudaEventElapsedTime(original)");
    printf("SGEMM CUDA Elapsed time (original): %f ms\n", elapsed_gpu);
    CheckCuda(cudaEventElapsedTime(&elapsed_gpu, start_opt, stop_opt), "cudaEventElapsedTime(shared)");
    printf("SGEMM CUDA Elapsed time (shared): %f ms\n", elapsed_gpu);
    elapsed_cpu = (finish.tv_sec - begin.tv_sec);
    elapsed_cpu += (finish.tv_nsec - begin.tv_nsec) / 1000000000.0;
    printf("Host time: %f ms\n", elapsed_cpu * 1000);
    
    if (IsMatDiff(C, D)) {
        printf("Something wrong!!\n");
    }
    else {
        printf("Success !!\n");
    }
    
    // Finalize CUDA events
    CheckCuda(cudaEventDestroy(start_org), "cudaEventDestroy(start_org)");
    CheckCuda(cudaEventDestroy(stop_org), "cudaEventDestroy(stop_org)");
    CheckCuda(cudaEventDestroy(start_opt), "cudaEventDestroy(start_opt)");
    CheckCuda(cudaEventDestroy(stop_opt), "cudaEventDestroy(stop_opt)");
    
    // Release device memory
    CheckCuda(cudaFree(dA.elements), "cudaFree(dA)");
    CheckCuda(cudaFree(dB.elements), "cudaFree(dB)");
    CheckCuda(cudaFree(dC.elements), "cudaFree(dC)");
    CheckCuda(cudaFree(dD.elements), "cudaFree(dD)");
    
    // Release host memory with the deallocator that matches its allocator
    if (memtype == NORMAL) {
        free(A.elements);
        free(B.elements);
        free(C.elements);
        free(D.elements);
    }
    else {
        CheckCuda(cudaFreeHost(A.elements), "cudaFreeHost(A)");
        CheckCuda(cudaFreeHost(B.elements), "cudaFreeHost(B)");
        CheckCuda(cudaFreeHost(C.elements), "cudaFreeHost(C)");
        CheckCuda(cudaFreeHost(D.elements), "cudaFreeHost(D)");
    }
    
    return 0;
}

// Sets the dimensions of mat and allocates its element buffer.
//
// Parameters:
//   mat      - descriptor to initialize; width, height and elements are overwritten
//   width    - number of columns
//   height   - number of rows
//   target   - DEVICE allocates with cudaMalloc and leaves the buffer unfilled;
//              any other value allocates on the host and fills every element with 1
//   memtype  - host only: NORMAL uses malloc, anything else uses cudaHostAlloc
//
// Prints a diagnostic and exits with EXIT_FAILURE when an allocation fails,
// instead of continuing with a NULL or unset elements pointer.
template <typename T>
void InitMatrix(Matrix<T> &mat, const int width, const int height, TARGET target, MEMTYPE memtype) {
    mat.width = width;
    mat.height = height;
    mat.elements = NULL;
    
    // Compute the size in size_t so large dimensions do not overflow int.
    const size_t bytes = (size_t)width * (size_t)height * sizeof(T);
    
    if (target == DEVICE) {
        cudaError_t status = cudaMalloc((void**)&mat.elements, bytes);
        if (status != cudaSuccess) {
            fprintf(stderr, "InitMatrix: cudaMalloc failed: %s\n", cudaGetErrorString(status));
            exit(EXIT_FAILURE);
        }
    }
    else {
        if (memtype == NORMAL) {
            mat.elements = (T*)malloc(bytes);
            // malloc may legitimately return NULL for a zero-byte request.
            if (mat.elements == NULL && bytes != 0) {
                fprintf(stderr, "InitMatrix: malloc of %lu bytes failed\n", (unsigned long)bytes);
                exit(EXIT_FAILURE);
            }
        }
        else {
            cudaError_t status = cudaHostAlloc((void**)&mat.elements, bytes, cudaHostAllocDefault);
            if (status != cudaSuccess) {
                fprintf(stderr, "InitMatrix: cudaHostAlloc failed: %s\n", cudaGetErrorString(status));
                exit(EXIT_FAILURE);
            }
        }
        
        // Fill with a constant; an index-based alternative pattern is
        // row * width + col * 0.001.
        for (int row = 0; row < height; row++) {
            for (int col = 0; col < width; col++) {
                mat.elements[(size_t)row * width + col] = 1.f;
            }
        }
    }
}

// Compares two host matrices element by element using exact equality.
//
// Returns true immediately, without printing anything, when the dimensions
// differ. Otherwise prints every element of A followed by every element of B
// and returns true if at least one element differs, false if all match.
// When there are differences, the number of mismatching elements is printed
// as "Count: N".
template <typename T>
bool IsMatDiff(Matrix<T> &A, Matrix<T> &B) {
    if (A.width != B.width || A.height != B.height) {
        return true;
    }
    
    // Number of positions at which the two matrices disagree.
    int count = 0;
    for (int row = 0; row < A.height; row++) {
        for (int col = 0; col < A.width; col++) {
            if (A.elements[row * A.width + col] != B.elements[row * A.width + col]) {
                count++;
            }
        }
    }
    
    for (int row = 0; row < A.height; row++) {
        for (int col = 0; col < A.width; col++) {
            printf("%f ", A.elements[row * A.width + col]);
        }
        printf("\n");
    }
    
    for (int row = 0; row < B.height; row++) {
        for (int col = 0; col < B.width; col++) {
            printf("%f ", B.elements[row * B.width + col]);
        }
        printf("\n");
    } 
    
    if (count != 0) {
        printf("Count: %d\n", count);
        return true;
    }
    return false;
}

Overwriting test_shared.cu


In [9]:
! make test_shared

nvcc --ptxas-options=--verbose -gencode arch=compute_30,code=sm_30 -I/usr/local/cuda/samples/common/inc test_shared.cu -c test_shared.o
ptxas info    : 0 bytes gmem
ptxas info    : Compiling entry function '_Z12sgemm_sharedIfEv6MatrixIT_ES2_S2_S1_S1_ii' for 'sm_30'
ptxas info    : Function properties for _Z12sgemm_sharedIfEv6MatrixIT_ES2_S2_S1_S1_ii
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 23 registers, 2048 bytes smem, 384 bytes cmem[0]
ptxas info    : Compiling entry function '_Z5sgemmIfEv6MatrixIT_ES2_S2_S1_S1_ii' for 'sm_30'
ptxas info    : Function properties for _Z5sgemmIfEv6MatrixIT_ES2_S2_S1_S1_ii
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 17 registers, 384 bytes cmem[0]
nvcc --ptxas-options=--verbose -gencode arch=compute_30,code=sm_30 -I/usr/local/cuda/samples/common/inc test_shared.o -o test_shared


In [10]:
! ./test_shared

SGEMM CUDA Elapsed time (original): 0.019072 ms
SGEMM CUDA Elapsed time (shared): 0.012000 ms
Host time: 0.092196 ms
8.500000 8.500000 8.500000 8.500000 
8.500000 8.500000 8.500000 8.500000 
8.500000 8.500000 8.500000 8.500000 
8.500000 8.500000 8.500000 8.500000 
8.500000 8.500000 8.500000 8.500000 
8.500000 8.500000 8.500000 8.500000 
8.500000 8.500000 8.500000 8.500000 
8.500000 8.500000 8.500000 8.500000 
Success !!


이번 예제에서는 Kernel의 수행 시간만을 측정했습니다.

수행 결과, Shared Memory를 사용하기 전과 후를 비교했을 때, CUDA Kernel의 성능 향상이 있었음을 확인할 수 있었습니다.

**연습해보기**
1. sgemm_shared CUDA Kernel에서 **`__syncthreads()`**를 사용했을 때와 사용하지 않았을 때의 결과를 비교해 보세요.
1. Matrix의 크기를 바꾸어가면서 실행해 보세요. 예제 코드는 정사각 행렬(Square Matrix)만 다루도록 작성되어 있지만, 수정하시면 다른 형태의 행렬에도 사용할 수 있습니다.