# 4. Zero copy

Zero-copy를 이용하시면, Pinned memory를 사용하는 한편 명시적으로 GPU memory를 할당하고, 복사하는 과정을 생략하고, GPU Kernel에서 Host memory에 접근하는 것과 같은 효과를 가져올 수 있습니다.

In [13]:
%%file sgemm_zero_copy.cu

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <helper_cuda.h>

typedef enum TARGET {HOST, DEVICE} TARGET;

typedef struct {
    int width;
    int height;
    float *elements;
} Matrix;

__global__ void sgemm(Matrix A, Matrix B, Matrix C, 
                      const float alpha, const float beta, 
                      const int width, const int height) {
    int idx_x = blockDim.x * blockIdx.x + threadIdx.x;
    int idx_y = blockDim.y * blockIdx.y + threadIdx.y;
    int idx = idx_y * width + idx_x;
    
    if (idx_x >= width || idx_y >= height)
        return;
    
    float value = 0.f;
    for (int e = 0; e < width; e++)
        value += A.elements[idx_y * width + e] * B.elements[e * width + idx_x];
    C.elements[idx] = alpha * value + beta * C.elements[idx];
}

void InitMatrix(Matrix &mat, const int width, const int height, TARGET target = HOST);

int main(int argv, char* argc[]) {
    Matrix A, B, C;
    Matrix dA, dB, dC;
    const float alpha = 2.f;
    const float beta = .5f;
    const int width = 2048;
    const int height = width;
    float elapsed_gpu;
    double elapsed_cpu;
    
    // CUDA Event Create to estimate elased time
    cudaEvent_t start, stop;
    struct timespec begin, finish;
    
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    
    // Initialize host matrix
    InitMatrix(A, width, height);
    InitMatrix(B, width, height);
    InitMatrix(C, width, height);
    
    // CUDA Memory Initialize
    cudaHostGetDevicePointer((void**)&dA.elements, A.elements, 0);
    cudaHostGetDevicePointer((void**)&dB.elements, B.elements, 0);
    cudaHostGetDevicePointer((void**)&dC.elements, C.elements, 0);
    
    // CUDA Operation
    cudaEventRecord(start, 0);
    clock_gettime(CLOCK_MONOTONIC, &begin);
    
    // Copy host data to the device (CUDA global memory) YOU DON'T NEED THIS
    //cudaMemcpy(dA.elements, A.elements, width * height * sizeof(float), cudaMemcpyHostToDevice);
    //cudaMemcpy(dB.elements, B.elements, width * height * sizeof(float), cudaMemcpyHostToDevice);
    //cudaMemcpy(dC.elements, C.elements, width * height * sizeof(float), cudaMemcpyHostToDevice);
    
    // Launch GPU Kernel
    dim3 blockDim(16, 16);
    dim3 gridDim((width + blockDim.x - 1) / blockDim.x, (height + blockDim.y - 1) / blockDim.y);
    sgemm<<<gridDim, blockDim>>>(dA, dB, dC, alpha, beta, width, height);
    cudaEventRecord(stop, 0);
    
    // Copy computation result from the Device the host memory
    //cudaMemcpy(C.elements, dC.elements, width * height * sizeof(float), cudaMemcpyDeviceToHost);
    
    clock_gettime(CLOCK_MONOTONIC, &finish);
    
    // Estimate CUDA operation time
    cudaEventSynchronize(stop);
    cudaDeviceSynchronize();
    
    cudaEventElapsedTime(&elapsed_gpu, start, stop);
    printf("SGEMM CUDA Elapsed time: %f ms\n", elapsed_gpu);
    elapsed_cpu = (finish.tv_sec - begin.tv_sec);
    elapsed_cpu += (finish.tv_nsec - begin.tv_nsec) / 1000000000.0;
    printf("Host time: %f ms\n", elapsed_cpu * 1000);
    
    // finalize CUDA event
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    
    // Finalize
    //cudaFree(dA.elements);
    //cudaFree(dB.elements);
    //cudaFree(dC.elements);
    
    cudaFreeHost(A.elements);
    cudaFreeHost(B.elements);
    cudaFreeHost(C.elements);
    
    return 0;
}

void InitMatrix(Matrix &mat, const int width, const int height, TARGET target) {
    mat.width = width;
    mat.height = height;
    
    if (target == DEVICE) {
        cudaMalloc((void**)&mat.elements, width * height * sizeof(float));
    }
    else {
        checkCudaErrors(cudaHostAlloc(&mat.elements, width * height * sizeof(float), cudaHostAllocMapped));

        for (int row = 0; row < height; row++) {
            for (int col = 0; col < width; col++) {
                mat.elements[row * width + col] = row * width + col * 0.001;
            }
        }
    }
}

Overwriting sgemm_zero_copy.cu


In [14]:
! make sgemm_zero_copy

nvcc sgemm_zero_copy.cu --ptxas-options=--verbose -gencode=arch=compute_35,code=sm_35 -I/usr/local/cuda/samples/common/inc -o sgemm_zero_copy
ptxas info    : 0 bytes gmem
ptxas info    : Compiling entry function '_Z5sgemm6MatrixS_S_ffii' for 'sm_35'
ptxas info    : Function properties for _Z5sgemm6MatrixS_S_ffii
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 17 registers, 384 bytes cmem[0]


In [15]:
! ./sgemm_zero_copy

SGEMM CUDA Elapsed time: 0.020896 ms
Host time: 0.018359 ms


Zero-copy를 사용했을 때, 보다 빠른 수행시간을 보였습니다. 이는 Kernel 실행이 GPU 메모리 복사가 끝나기까지 기다려야하