여기서는 Transpose 동작에 대하여 최적화를 하는 실습을 해보도록 하겠습니다.

# 1. Naïve Transpose
행렬에서 Transpose 동작은 행렬의 요소들을 이항시키는 작업을 의미합니다.

"01. CUDA Basic"에서 Transpose 동작에 대하여 실습을 해 보았습니다.

In [1]:
%%file task1.cu
#include <stdio.h>
#include "util.c"

#define BLOCK_SIZE 16
#define INDEX( row, col, width ) ( ( (row) * (width) ) + (col) )

__global__ 
void d_transpose(float *d_out, float *d_in, int n_width, int n_height) {
    int row = FIXME
    int col = FIXME
    
    // TODO: Write transpose code
    d_out[FIXME] = d_in[FIXME];
}

void transpose(float *p_out, float *p_in, int n_width, int n_height) {
    for (int j = 0; j < n_height; j++) {
        for (int i = 0; i < n_width; i++) {
            p_out[INDEX(i, j, n_height)] = p_in[INDEX(j, i, n_width)];
        }
    }
}

int main() {
    float *p_in, *p_out, *p_out_cuda;
    float *d_in, *d_out;
    
    int n_width = 4096;
    int n_height = 3072;
    
    p_in = get_buffer(n_width * n_height);
    p_out = get_buffer(n_width * n_height);
    p_out_cuda = get_buffer(n_width * n_height);
    
    /* create and start timer */

    cudaEvent_t start, stop;
    float elapsedTime;
    cudaEventCreate( &start );
    cudaEventCreate( &stop );
    
    // Step 1. Allocate to GPU memory
    cudaMalloc((void**)&d_in, n_width * n_height * sizeof(float));
    cudaMalloc((void**)&d_out, n_width * n_height * sizeof(float));
    
    // Initialize input data
    for (int j = 0; j < n_height; j++) {
        for (int i = 0; i < n_width; i++) {
            p_in[j * n_width + i] = float(j * n_width + i);
        }
    }
    
    // Step 2. Copy to GPU memory
    cudaMemcpy(d_in, p_in, n_width * n_height * sizeof(float), cudaMemcpyHostToDevice);
    
    cudaEventRecord( start, 0 );
    printf("%s[%d]\n", __FILE__, __LINE__);
    // CPU transpose
    transpose(p_out, p_in, n_width, n_height);
    printf("%s[%d]\n", __FILE__, __LINE__);
    
    cudaEventRecord( stop, 0 );
    cudaEventSynchronize( stop );
    cudaEventElapsedTime( &elapsedTime, start, stop );
    
    // print CPU performance
    fprintf(stdout, "Total time CPU is %f sec\n", elapsedTime / 1000.0f );
    fprintf(stdout, "Performance is %f GB/s\n", 8.0 * 2.0 * (double) n_width * (double) n_height / 
        ( (double) elapsedTime / 1000.0 ) * 1.e-9 );
    
    // Step 3. Kernel leaunch
    dim3 blockDim(BLOCK_SIZE, BLOCK_SIZE);
    dim3 gridDim(n_width / BLOCK_SIZE + 1, n_height / BLOCK_SIZE + 1);
    
    cudaEventRecord( start, 0 );
    
    d_transpose<<<gridDim, blockDim>>>(d_out, d_in, n_width, n_height);
    
    cudaEventRecord( stop, 0 );
    cudaEventSynchronize( stop );
    cudaEventElapsedTime( &elapsedTime, start, stop );
    
    // print GPU performance
    fprintf(stdout, "Total time GPU is %f sec\n", elapsedTime / 1000.0f );
    fprintf(stdout, "Performance is %f GB/s\n", 8.0 * 2.0 * (double) n_width * (double) n_height / 
        ( (double) elapsedTime / 1000.0 ) * 1.e-9 );
    
    // Step 4. Copy from GPU
    cudaMemcpy(p_out_cuda, d_out, n_width * n_height * sizeof(float), cudaMemcpyDeviceToHost);
    
    // Step 5. check result
    check_result(p_out, p_out_cuda, n_width * n_height);
    
    // Step 6. free GPU memory
    cudaFree(d_in);
    cudaFree(d_out);
    
    free(p_in);
    free(p_out);
    free(p_out_cuda);
}

Overwriting task1.cu


In [2]:
!nvcc -lineinfo -DDEBUG -arch=sm_35 -o task1.out task1.cu && echo Compiled Successfully!

task1.cu(9): error: identifier "FIXME" is undefined

task1.cu(10): error: expected a ";"


2 errors detected in the compilation of "/tmp/tmpxft_00000082_00000000-8_task1.cpp1.ii".


In [3]:
!./task1.out

task1.cu[64]
task1.cu[67]
Total time CPU is 0.021915 sec
Performance is 1.513950 GB/s
Total time CPU is 0.000172 sec
Performance is 193.324628 GB/s
Result: 0


# 2. Shared Memory as Transpose buffer
GPU의 Global memory를 보다 효율적으로 활용하기 위해서는 Shared Memory를 사용하면 된다는 것을 배웠습니다.
Transpose에서 Shared Memory의 효과를 확인해 보도록 하겠습니다.

In [4]:
%%file task2.cu
#include <stdio.h>
#include "util.c"

#define BLOCK_SIZE 16
#define INDEX( row, col, width ) ( ( (row) * (width) ) + (col) )

__global__ 
void d_transpose(float *d_out, float *d_in, int n_width, int n_height) {
    int col = blockDim.x * blockIdx.x + threadIdx.x;
    int row = blockDim.y * blockIdx.y + threadIdx.y;
    
    __shared__ double s_buffer[BLOCK_SIZE][BLOCK_SIZE];
    
    int tileX = blockDim.x * blockIdx.x;
    int tileY = blockDim.y * blockIdx.y;
    
    if (col < n_width && row < n_height)
        s_buffer[threadIdx.y][threadIdx.x] = d_in[INDEX(tileY + threadIdx.y, tileX + threadIdx.x, n_width)];
    __syncthreads();
    
    if (col < n_width && row < n_height)
        d_out[INDEX(tileX + threadIdx.y, tileY + threadIdx.x, n_height)] = s_buffer[threadIdx.x][threadIdx.y];
}

void transpose(float *p_out, float *p_in, int n_width, int n_height) {
    for (int j = 0; j < n_height; j++) {
        for (int i = 0; i < n_width; i++) {
            p_out[INDEX(i, j, n_height)] = p_in[INDEX(j, i, n_width)];
        }
    }
}

int main() {
    float *p_in, *p_out, *p_out_cuda;
    float *d_in, *d_out;
    
    int n_width = 4096;
    int n_height = 3072;
    
    p_in = get_buffer(n_width * n_height);
    p_out = get_buffer(n_width * n_height);
    p_out_cuda = get_buffer(n_width * n_height);
    
    /* create and start timer */

    cudaEvent_t start, stop;
    float elapsedTime;
    cudaEventCreate( &start );
    cudaEventCreate( &stop );
    
    // Step 1. Allocate to GPU memory
    cudaMalloc((void**)&d_in, n_width * n_height * sizeof(float));
    cudaMalloc((void**)&d_out, n_width * n_height * sizeof(float));
    
    // Initialize input data
    for (int j = 0; j < n_height; j++) {
        for (int i = 0; i < n_width; i++) {
            p_in[j * n_width + i] = float(j * n_width);
        }
    }
    
    // Step 2. Copy to GPU memory
    cudaMemcpy(d_in, p_in, n_width * n_height * sizeof(float), cudaMemcpyHostToDevice);
    
    cudaEventRecord( start, 0 );
    
    // CPU transpose
    transpose(p_out, p_in, n_width, n_height);
    
    cudaEventRecord( stop, 0 );
    cudaEventSynchronize( stop );
    cudaEventElapsedTime( &elapsedTime, start, stop );
    
    // print CPU performance
    fprintf(stdout, "Total time CPU is %f sec\n", elapsedTime / 1000.0f );
    fprintf(stdout, "Performance is %f GB/s\n", 8.0 * 2.0 * (double) n_width * (double) n_height / 
        ( (double) elapsedTime / 1000.0 ) * 1.e-9 );
    
    // Step 3. Kernel leaunch
    dim3 blockDim(BLOCK_SIZE, BLOCK_SIZE);
    dim3 gridDim(n_width / BLOCK_SIZE + 1, n_height / BLOCK_SIZE + 1);
    
    cudaEventRecord( start, 0 );
    
    d_transpose<<<gridDim, blockDim>>>(d_out, d_in, n_width, n_height);
    
    cudaEventRecord( stop, 0 );
    cudaEventSynchronize( stop );
    cudaEventElapsedTime( &elapsedTime, start, stop );
    
    // print GPU performance
    fprintf(stdout, "Total time GPU is %f sec\n", elapsedTime / 1000.0f );
    fprintf(stdout, "Performance is %f GB/s\n", 8.0 * 2.0 * (double) n_width * (double) n_height / 
        ( (double) elapsedTime / 1000.0 ) * 1.e-9 );
    
    // Step 4. Copy from GPU
    cudaMemcpy(p_out_cuda, d_out, n_width * n_height * sizeof(float), cudaMemcpyDeviceToHost);
    
    // Step 5. check result
    check_result(p_out, p_out_cuda, n_width * n_height);
    
    // Step 6. free GPU memory
    cudaFree(d_in);
    cudaFree(d_out);
    
    free(p_in);
    free(p_out);
    free(p_out_cuda);
}

Overwriting task2.cu


In [5]:
!nvcc -lineinfo -DDEBUG -arch=sm_35 -o task2.out task2.cu && echo Compiled Successfully!

Compiled Successfully!


In [6]:
!./task2.out

Total time CPU is 0.346841 sec
Performance is 0.580459 GB/s
Total time GPU is 0.000341 sec
Performance is 589.750270 GB/s
Result: 0


# 3. No-Bank Conflict
위 예제를 통해 GPU에 따라 이전과 비교하여 성능에 변화가 생겼습니다.
사용하는 GPU에 따라 성능은 이전보다 빠를 수도 느릴 수도 있습니다. 하지만 어떤 경우라도 사실 이것은 최적의 결과는 아닙니다.
이유는 이전의 경우 Transpose를 하는 과정에서 Shared memory를 사용하는데 bank conflict를 유발하고 있기 때문입니다.

Bank Conflict는 Share memory의 경우 여느 메모리와 마찬가지로 bank로 관리가 됩니다. 만일 복수의 CUDA thread가 동시에 하나의 메모리 bank에 있는 주소에 접근을 하려 하는 경우, memory bank는 순차적으로 처리를 하게 되기에 처리속도에 영향을 주게 됩니다. 이를 피하기 위해서는 shared memory에 접근하는 pattern을 개선해야 할 필요가 있습니다.

가장 간단한 방법으로는 가로 방향으로 1칸의 크기를 더 할당하는 것입니다.
Transpose를 할 때, 처음 shared memory에 데이터를 복사할 때는 각기 다른 bank를 사용하여 bank conflict가 발생하지 않습니다. 하지만 transpose된 형태로 데이터를 읽어 들일때 하나의 bank에 집중되는 현상이 나타나므로, shared memory에 공간을 비틀어서 bank conflict를 우회하는 것입니다.

In [7]:
%%file task3.cu
#include <stdio.h>
#include "util.c"

#define BLOCK_SIZE 16
#define INDEX( row, col, width ) ( ( (row) * (width) ) + (col) )

__global__ 
void d_transpose(float *d_out, float *d_in, int n_width, int n_height) {
    int col = blockDim.x * blockIdx.x + threadIdx.x;
    int row = blockDim.y * blockIdx.y + threadIdx.y;
    
    __shared__ double s_buffer[BLOCK_SIZE][BLOCK_SIZE+1];
    
    int tileX = blockDim.x * blockIdx.x;
    int tileY = blockDim.y * blockIdx.y;
    
    if (col < n_width && row < n_height)
        s_buffer[threadIdx.y][threadIdx.x] = d_in[INDEX(tileY + threadIdx.y, tileX + threadIdx.x, n_width)];
    __syncthreads();
    
    if (col < n_width && row < n_height)
        d_out[INDEX(tileX + threadIdx.y, tileY + threadIdx.x, n_height)] = s_buffer[threadIdx.x][threadIdx.y];
}

void transpose(float *p_out, float *p_in, int n_width, int n_height) {
    for (int j = 0; j < n_height; j++) {
        for (int i = 0; i < n_width; i++) {
            p_out[INDEX(i, j, n_height)] = p_in[INDEX(j, i, n_width)];
        }
    }
}

int main() {
    float *p_in, *p_out, *p_out_cuda;
    float *d_in, *d_out;
    
    int n_width = 4096;
    int n_height = 3072;
    
    p_in = get_buffer(n_width * n_height);
    p_out = get_buffer(n_width * n_height);
    p_out_cuda = get_buffer(n_width * n_height);
    
    /* create and start timer */

    cudaEvent_t start, stop;
    float elapsedTime;
    cudaEventCreate( &start );
    cudaEventCreate( &stop );
    
    // Step 1. Allocate to GPU memory
    cudaMalloc((void**)&d_in, n_width * n_height * sizeof(float));
    cudaMalloc((void**)&d_out, n_width * n_height * sizeof(float));
    
    // Initialize input data
    for (int j = 0; j < n_height; j++) {
        for (int i = 0; i < n_width; i++) {
            p_in[j * n_width + i] = float(j * n_width);
        }
    }
    
    // Step 2. Copy to GPU memory
    cudaMemcpy(d_in, p_in, n_width * n_height * sizeof(float), cudaMemcpyHostToDevice);
    
    cudaEventRecord( start, 0 );
    
    // CPU transpose
    transpose(p_out, p_in, n_width, n_height);
    
    cudaEventRecord( stop, 0 );
    cudaEventSynchronize( stop );
    cudaEventElapsedTime( &elapsedTime, start, stop );
    
    // print CPU performance
    fprintf(stdout, "Total time CPU is %f sec\n", elapsedTime / 1000.0f );
    fprintf(stdout, "Performance is %f GB/s\n", 8.0 * 2.0 * (double) n_width * (double) n_height / 
        ( (double) elapsedTime / 1000.0 ) * 1.e-9 );
    
    // Step 3. Kernel leaunch
    dim3 blockDim(BLOCK_SIZE, BLOCK_SIZE);
    dim3 gridDim(n_width / BLOCK_SIZE + 1, n_height / BLOCK_SIZE + 1);
    
    cudaEventRecord( start, 0 );
    
    d_transpose<<<gridDim, blockDim>>>(d_out, d_in, n_width, n_height);
    
    cudaEventRecord( stop, 0 );
    cudaEventSynchronize( stop );
    cudaEventElapsedTime( &elapsedTime, start, stop );
    
    // print GPU performance
    fprintf(stdout, "Total time GPU is %f sec\n", elapsedTime / 1000.0f );
    fprintf(stdout, "Performance is %f GB/s\n", 8.0 * 2.0 * (double) n_width * (double) n_height / 
        ( (double) elapsedTime / 1000.0 ) * 1.e-9 );
    
    // Step 4. Copy from GPU
    cudaMemcpy(p_out_cuda, d_out, n_width * n_height * sizeof(float), cudaMemcpyDeviceToHost);
    
    // Step 5. check result
    check_result(p_out, p_out_cuda, n_width * n_height);
    
    // Step 6. free GPU memory
    cudaFree(d_in);
    cudaFree(d_out);
    
    free(p_in);
    free(p_out);
    free(p_out_cuda);
}

Overwriting task3.cu


In [8]:
!nvcc -lineinfo -DDEBUG -arch=sm_35 -o task3.out task3.cu && echo Compiled Successfully!

Compiled Successfully!


In [9]:
!./task3.out

Total time CPU is 0.318297 sec
Performance is 0.632512 GB/s
Total time GPU is 0.000270 sec
Performance is 745.256614 GB/s
Result: 0
