# Optimization
## 2. Pinned Memory

Pinned Memory(page-locked memory)는 운영체제가 페이지 아웃하지 않도록 고정된 호스트 메모리로, GPU가 DMA Operation을 통해 데이터를 직접 전송할 수 있도록 해주는 메모리 공간을 말합니다. CPU/GPU 간 데이터 전송에 CPU가 관여하지 않으므로 CPU 점유율이 낮고, PCI Express의 대역폭을 최대한 사용할 수 있다는 장점이 있습니다.

In [1]:
%%file sgemm_pinned_memory.cu

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <helper_cuda.h>

// Selects where InitMatrix() allocates a matrix's storage:
// HOST   -> host memory obtained with cudaHostAlloc (and filled with values),
// DEVICE -> device memory obtained with cudaMalloc (left uninitialized).
typedef enum TARGET {HOST, DEVICE} TARGET;

// Dense matrix descriptor. It is passed to the sgemm kernel by value, so only
// the pointer and the two dimensions are copied, not the element data.
typedef struct {
    int width;        // number of columns (set by InitMatrix)
    int height;       // number of rows (set by InitMatrix)
    float *elements;  // width * height floats, indexed as elements[row * width + col]
} Matrix;

/**
 * Naive single-precision GEMM: C = alpha * (A x B) + beta * C.
 *
 * Launch layout: 2D grid of 2D blocks; the thread at global (x, y) produces
 * the output element in column x, row y. Threads that fall outside the
 * width x height output do nothing, so the grid may over-cover the matrix.
 *
 * All three matrices are addressed with a row stride of `width`, and the
 * reduction dimension is also `width`; the width/height members stored inside
 * the Matrix arguments are not consulted.
 *
 * @param A      left operand; A.elements must be dereferenceable on the device
 * @param B      right operand; B.elements must be dereferenceable on the device
 * @param C      accumulator/output; read once and overwritten in place
 * @param alpha  scale applied to the A x B product
 * @param beta   scale applied to the previous contents of C
 * @param width  column count / row stride / reduction length
 * @param height row count of the output
 */
__global__ void sgemm(Matrix A, Matrix B, Matrix C, 
                      const float alpha, const float beta, 
                      const int width, const int height) {
    const int col = blockDim.x * blockIdx.x + threadIdx.x;
    const int row = blockDim.y * blockIdx.y + threadIdx.y;

    // Only threads that map onto a real output element take part.
    if (col < width && row < height) {
        const int out = row * width + col;

        // Dot product of row `row` of A with column `col` of B.
        float acc = 0.f;
        int k = 0;
        while (k < width) {
            acc += A.elements[row * width + k] * B.elements[k * width + col];
            ++k;
        }

        C.elements[out] = alpha * acc + beta * C.elements[out];
    }
}

// Forward declaration; the definition follows main(). The default argument
// lives here, so calls that omit `target` allocate on the HOST.
void InitMatrix(Matrix &mat, const int width, const int height, TARGET target = HOST);

/**
 * Times a 2048 x 2048 SGEMM whose host buffers are pinned (page-locked).
 *
 * The measured region covers: three host-to-device copies, the kernel launch,
 * and the device-to-host copy of the result. It is timed twice, once with
 * CUDA events (GPU timeline) and once with CLOCK_MONOTONIC (host wall clock).
 *
 * Every CUDA runtime call is wrapped in checkCudaErrors (from helper_cuda.h)
 * so a failure aborts with a message instead of silently producing a
 * meaningless timing.
 *
 * @return 0 on success; checkCudaErrors terminates the process on CUDA errors.
 */
int main(int argv, char* argc[]) {
    Matrix A, B, C_device;
    Matrix dA, dB, dC;
    const float alpha = 2.f;
    const float beta = .5f;
    const int width = 2048;
    const int height = 2048;
    // Widen before multiplying so the byte count is computed in size_t.
    const size_t bytes = (size_t)width * height * sizeof(float);
    float elapsed_gpu;
    double elapsed_cpu;
    
    // CUDA events to estimate elapsed time on the GPU timeline
    cudaEvent_t start, stop;
    struct timespec begin, finish;
    
    checkCudaErrors(cudaEventCreate(&start));
    checkCudaErrors(cudaEventCreate(&stop));
    
    // Allocate and fill the host matrices
    InitMatrix(A, width, height);
    InitMatrix(B, width, height);
    InitMatrix(C_device, width, height);
    
    // Allocate the device matrices
    InitMatrix(dA, width, height, DEVICE);
    InitMatrix(dB, width, height, DEVICE);
    InitMatrix(dC, width, height, DEVICE);
    
    // ---- timed region begins ----
    checkCudaErrors(cudaEventRecord(start, 0));
    clock_gettime(CLOCK_MONOTONIC, &begin);
    
    // Copy host data to the device (CUDA global memory)
    checkCudaErrors(cudaMemcpy(dA.elements, A.elements, bytes, cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpy(dB.elements, B.elements, bytes, cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpy(dC.elements, C_device.elements, bytes, cudaMemcpyHostToDevice));
    
    // Launch GPU kernel: 16x16 threads per block, grid rounded up to cover the matrix
    dim3 blockDim(16, 16);
    dim3 gridDim((width + blockDim.x - 1) / blockDim.x, (height + blockDim.y - 1) / blockDim.y);
    sgemm<<<gridDim, blockDim>>>(dA, dB, dC, alpha, beta, width, height);
    // A launch does not return a status; fetch launch-configuration errors explicitly.
    checkCudaErrors(cudaGetLastError());
    
    // Copy the computation result from the device to host memory.
    // This blocking copy also surfaces any error raised while the kernel ran.
    checkCudaErrors(cudaMemcpy(C_device.elements, dC.elements, bytes, cudaMemcpyDeviceToHost));
    
    clock_gettime(CLOCK_MONOTONIC, &finish);
    // Record `stop` exactly once, right after the timed work. Recording it a
    // second time after an extra synchronization would overwrite this timestamp.
    checkCudaErrors(cudaEventRecord(stop, 0));
    // ---- timed region ends ----
    
    // Wait until the stop event has actually been reached before reading it
    checkCudaErrors(cudaEventSynchronize(stop));
    
    checkCudaErrors(cudaEventElapsedTime(&elapsed_gpu, start, stop));
    printf("SGEMM CUDA Elapsed time: %f ms\n", elapsed_gpu);
    elapsed_cpu = (finish.tv_sec - begin.tv_sec);
    elapsed_cpu += (finish.tv_nsec - begin.tv_nsec) / 1000000000.0;
    printf("Host time: %f ms\n", elapsed_cpu * 1000);
    
    // Finalize CUDA events
    checkCudaErrors(cudaEventDestroy(start));
    checkCudaErrors(cudaEventDestroy(stop));
    
    // Release device buffers
    checkCudaErrors(cudaFree(dA.elements));
    checkCudaErrors(cudaFree(dB.elements));
    checkCudaErrors(cudaFree(dC.elements));
    
    // Release pinned host buffers (allocated with cudaHostAlloc in InitMatrix)
    checkCudaErrors(cudaFreeHost(A.elements));
    checkCudaErrors(cudaFreeHost(B.elements));
    checkCudaErrors(cudaFreeHost(C_device.elements));
    
    return 0;
}

/**
 * Sets the dimensions of `mat` and allocates storage for width * height floats.
 *
 * DEVICE: allocates with cudaMalloc; the contents are left uninitialized.
 * HOST:   allocates pinned (page-locked) memory with cudaHostAlloc and fills
 *         element (row, col) with row * width + col * 0.001.
 *
 * Both allocations are wrapped in checkCudaErrors, so a failed allocation
 * terminates the program instead of leaving mat.elements unusable.
 *
 * @param mat    matrix descriptor to initialize; any previous pointer is overwritten
 * @param width  number of columns
 * @param height number of rows
 * @param target where to allocate (the default is supplied by the prototype)
 */
void InitMatrix(Matrix &mat, const int width, const int height, TARGET target) {
    mat.width = width;
    mat.height = height;
    
    // Widen before multiplying so large matrices cannot overflow int.
    const size_t bytes = (size_t)width * height * sizeof(float);
    
    if (target == DEVICE) {
        checkCudaErrors(cudaMalloc((void**)&mat.elements, bytes));
    }
    else {
        checkCudaErrors(cudaHostAlloc(&mat.elements, bytes, cudaHostAllocDefault));

        for (int row = 0; row < height; row++) {
            for (int col = 0; col < width; col++) {
                // Same value as `row * width + col * 0.001`, but the row offset is
                // formed in double so it cannot overflow int either.
                mat.elements[(size_t)row * width + col] = (double)row * width + col * 0.001;
            }
        }
    }
}

Overwriting sgemm_pinned_memory.cu


In [2]:
! make sgemm_pinned_memory

nvcc sgemm_pinned_memory.cu --ptxas-options=--verbose -gencode=arch=compute_35,code=sm_35 -I/usr/local/cuda/samples/common/inc -o sgemm_pinned_memory


ptxas info    : 0 bytes gmem
ptxas info    : Compiling entry function '_Z5sgemm6MatrixS_S_ffii' for 'sm_35'
ptxas info    : Function properties for _Z5sgemm6MatrixS_S_ffii
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 17 registers, 384 bytes cmem[0]


In [3]:
! ./sgemm_pinned_memory

SGEMM CUDA Elapsed time: 6.507008 ms
Host time: 6.494357 ms
