# 3. Texture Memory

Texture memory는 GPU의 Texture HW를 이용하여, Index에 대해서 최적화된 동작을 할 수 있도록 해주는 read-only cache memory 입니다.

CUDA의 Texture를 사용하는 방법은 크게 2가지로 나눌 수 있습니다. 하나는 Reference이며, 다른 하나는 Object입니다.
각각의 사용법을 살펴보도록 하겠습니다.

## CUDA reference

In [393]:
%%file sgemm_texture_reference.cu

#include "sgemm.cuh"

texture<float, 1, cudaReadModeElementType> tex_A;
texture<float, 1, cudaReadModeElementType> tex_B;

__global__ void sgemm_texture(Matrix A, Matrix B, Matrix C, 
                      const float alpha, const float beta, 
                      const int width, const int height) {
    int idx_x = blockDim.x * blockIdx.x + threadIdx.x;
    int idx_y = blockDim.y * blockIdx.y + threadIdx.y;
    int idx = idx_y * width + idx_x;
    
    if (idx_x >= width || idx_y >= height)
        return;
    
    float value = 0.f;
    for (int e = 0; e < width; e++)
        value = alpha * tex1Dfetch(tex_A, idx_y * width + e) * tex1Dfetch(tex_B, e * width + idx_x);
    C.elements[idx] = value + beta * C.elements[idx];
}

void launch_sgemm_texture(Matrix &dA, Matrix &dB, Matrix &dC,
                      const float alpha, const float beta, 
                      const int width, const int height) {    
    // Bind the array to the texture reference
    cudaBindTexture(0, tex_A, dA.elements, width * height * sizeof(float));
    cudaBindTexture(0, tex_B, dB.elements, width * height * sizeof(float));
    
    dim3 blockDim(16, 16);
    dim3 gridDim((width + blockDim.x - 1) / blockDim.x, (height + blockDim.y - 1) / blockDim.y);
    sgemm_texture<<<gridDim, blockDim>>>(dA, dB, dC, alpha, beta, width, height);
    
    cudaUnbindTexture(tex_A);
    cudaUnbindTexture(tex_B);
}

Overwriting sgemm_texture_reference.cu


## CUDA object

In [394]:
%%file sgemm_texture_object.cu

#include "sgemm.cuh"

__global__ void sgemm_texture_object(cudaTextureObject_t tex_A, cudaTextureObject_t tex_B, Matrix C, 
                      const float alpha, const float beta, 
                      const int width, const int height) {
    int idx_x = blockDim.x * blockIdx.x + threadIdx.x;
    int idx_y = blockDim.y * blockIdx.y + threadIdx.y;
    int idx = idx_y * width + idx_x;
    
    if (idx_x >= width || idx_y >= height)
        return;
    
    float value = 0.f;
    for (int e = 0; e < width; e++)
        value = alpha * tex1Dfetch<float>(tex_A, idx_y * width + e) * tex1Dfetch<float>(tex_B, e * width + idx_x);
    C.elements[idx] = value + beta * C.elements[idx];
}

void launch_sgemm_texture_object(Matrix &dA, Matrix &dB, Matrix &dC,
                      const float alpha, const float beta, 
                      const int width, const int height) {    
    // create texture object
    cudaResourceDesc resDesc_A, resDesc_B;
    memset(&resDesc_A, 0, sizeof(resDesc_A));
    resDesc_A.resType = cudaResourceTypeLinear;
    resDesc_A.res.linear.devPtr = dA.elements;
    resDesc_A.res.linear.desc.f = cudaChannelFormatKindFloat;
    resDesc_A.res.linear.desc.x = 32; // bits per channel
    resDesc_A.res.linear.sizeInBytes = width * height * sizeof(float);
    
    memcpy(&resDesc_B, &resDesc_A, sizeof(resDesc_A));
    resDesc_B.res.linear.devPtr = dB.elements;

    cudaTextureDesc texDesc;
    memset(&texDesc, 0, sizeof(texDesc));
    texDesc.readMode = cudaReadModeElementType;

    // create texture object: we only have to do this once!
    cudaTextureObject_t tex_A, tex_B;
    cudaCreateTextureObject(&tex_A, &resDesc_A, &texDesc, NULL);
    cudaCreateTextureObject(&tex_B, &resDesc_B, &texDesc, NULL);
    
    dim3 blockDim(16, 16);
    dim3 gridDim((width + blockDim.x - 1) / blockDim.x, (height + blockDim.y - 1) / blockDim.y);
    sgemm_texture_object<<<gridDim, blockDim>>>(tex_B, tex_B, dC, alpha, beta, width, height);
    
    cudaDestroyTextureObject(tex_A);
    cudaDestroyTextureObject(tex_B);
}

Overwriting sgemm_texture_object.cu


## 빌드 및 실행

In [395]:
! make test_texture

nvcc --ptxas-options=--verbose -gencode=arch=compute_35,code=sm_35 -I/usr/local/cuda/samples/common/inc test_texture.cu -c test_texture.o
ptxas info    : 0 bytes gmem
ptxas info    : Compiling entry function '_Z13sgemm_texture6MatrixS_S_ffii' for 'sm_35'
ptxas info    : Function properties for _Z13sgemm_texture6MatrixS_S_ffii
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 8 registers, 392 bytes cmem[0], 2 textures
ptxas info    : Compiling entry function '_Z5sgemm6MatrixS_S_ffii' for 'sm_35'
ptxas info    : Function properties for _Z5sgemm6MatrixS_S_ffii
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 13 registers, 384 bytes cmem[0]
ptxas info    : Compiling entry function '_Z20sgemm_texture_objectyy6Matrixffii' for 'sm_35'
ptxas info    : Function properties for _Z20sgemm_texture_objectyy6Matrixffii
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 8 registers, 368 by

In [396]:
! ./test_texture

texture referecne mode...
SGEMM CUDA Elapsed time (original): 24.311520 ms
SGEMM CUDA Elapsed time (texture): 65.727905 ms
Host time: 99.762920 ms
Success !!


In [386]:
! ./test_texture 1

texture object mode...
SGEMM CUDA Elapsed time (original): 193.555389 ms
SGEMM CUDA Elapsed time (texture): 516.679016 ms
Host time: 748.431495 ms
Success !!


실행결과 오히려 속도가 저하되었습니다.

사실 이 예제에서는 texutre memory의 장점인 interpolation 등을 활용하지 않았기에, Texture memory를 활용하기 위한 overhead만이 측정된 예제라고 할 수 있습니다. 경우에 따라서 texture memory의 특성에 맞는 application이라면 그래도 시도해볼만한 메모리입니다.

In [387]:
%%file test_texture.cu

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include "sgemm.cuh"
#include "sgemm.cu"
#include "sgemm_texture_reference.cu"
#include "sgemm_texture_object.cu"

void InitMatrix(Matrix &mat, const int width, const int height, TARGET target = HOST, MEMTYPE memtype = NORMAL);
bool IsMatDiff(Matrix &A, Matrix &B);

int main(int argc, char* argv[]) {
    Matrix A, B, C, D;
    Matrix dA, dB, dC, dD;
    const float alpha = 2.f;
    const float beta = .5f;
    const int width = 2048;
    const int height = width;
    float elapsed_gpu;
    double elapsed_cpu;
    
    // Select Host memory type (NORMAL, PINNED)
    MEMTYPE memtype = PINNED;
    bool texture_reference_mode = true;
    if (argc > 1) {
        if (argv[1]) {
            texture_reference_mode = false;
        }
    }
    
    // CUDA Event Create to estimate elased time
    cudaEvent_t start_org, stop_org, start_opt, stop_opt;
    struct timespec begin, finish;
    
    cudaEventCreate(&start_org);
    cudaEventCreate(&stop_org);
    cudaEventCreate(&start_opt);
    cudaEventCreate(&stop_opt);
    
    // Initialize host matrix
    InitMatrix(A, width, height, HOST, memtype);
    InitMatrix(B, width, height, HOST, memtype);
    InitMatrix(C, width, height, HOST, memtype);
    InitMatrix(D, width, height, HOST, memtype);

    // CUDA Memory Initialize
    InitMatrix(dA, width, height, DEVICE);
    InitMatrix(dB, width, height, DEVICE);
    InitMatrix(dC, width, height, DEVICE);
    InitMatrix(dD, width, height, DEVICE);
    
    // CUDA Operation
    clock_gettime(CLOCK_MONOTONIC, &begin);
    
    // Copy host data to the device (CUDA global memory)
    cudaMemcpyAsync(dA.elements, A.elements, width * height * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpyAsync(dB.elements, B.elements, width * height * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpyAsync(dC.elements, C.elements, width * height * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpyAsync(dD.elements, D.elements, width * height * sizeof(float), cudaMemcpyHostToDevice);
    
    // Launch GPU Kernel
    cudaEventRecord(start_org, 0);
    launch_sgemm(dA, dB, dC, alpha, beta, width, height);
    cudaEventRecord(stop_org, 0);
    cudaEventRecord(start_opt, 0);
    if (texture_reference_mode == true) {
        printf("texture referecne mode...\n");
        launch_sgemm_texture(dA, dB, dD, alpha, beta, width, height);
    } else {
        printf("texture object mode...\n");
        launch_sgemm_texture_object(dA, dB, dD, alpha, beta, width, height);
    }
    cudaEventRecord(stop_opt, 0);
    
    // Copy computation result from the Device the host memory
    cudaMemcpyAsync(C.elements, dC.elements, width * height * sizeof(float), cudaMemcpyDeviceToHost);
    cudaMemcpyAsync(D.elements, dD.elements, width * height * sizeof(float), cudaMemcpyDeviceToHost);
    
    // Estimate CUDA operation time
    cudaEventSynchronize(stop_org);
    cudaEventSynchronize(stop_opt);
    cudaDeviceSynchronize();
    clock_gettime(CLOCK_MONOTONIC, &finish);
    
    cudaEventElapsedTime(&elapsed_gpu, start_org, stop_org);
    printf("SGEMM CUDA Elapsed time (original): %f ms\n", elapsed_gpu);
    cudaEventElapsedTime(&elapsed_gpu, start_opt, stop_opt);
    printf("SGEMM CUDA Elapsed time (texture): %f ms\n", elapsed_gpu);
    elapsed_cpu = (finish.tv_sec - begin.tv_sec);
    elapsed_cpu += (finish.tv_nsec - begin.tv_nsec) / 1000000000.0;
    printf("Host time: %f ms\n", elapsed_cpu * 1000);
    
    if (IsMatDiff(C, D)) {
        printf("Something wrong!!\n");
    }
    else {
        printf("Success !!\n");
    }
    
    // finalize CUDA event
    cudaEventDestroy(start_org);
    cudaEventDestroy(stop_org);
    cudaEventDestroy(start_opt);
    cudaEventDestroy(stop_opt);
    
    // Finalize
    cudaFree(dA.elements);
    cudaFree(dB.elements);
    cudaFree(dC.elements);
    cudaFree(dD.elements);
    
    if (memtype == NORMAL) {
        free(A.elements);
        free(B.elements);
        free(C.elements);
        free(D.elements);
    }
    else {
        cudaFreeHost(A.elements);
        cudaFreeHost(B.elements);
        cudaFreeHost(C.elements);
        cudaFreeHost(D.elements);
    }
    
    return 0;
}

void InitMatrix(Matrix &mat, const int width, const int height, TARGET target, MEMTYPE memtype) {
    mat.width = width;
    mat.height = height;
    
    if (target == DEVICE) {
        cudaMalloc((void**)&mat.elements, width * height * sizeof(float));
    }
    else {
        if (memtype == NORMAL)
            mat.elements = (float*)malloc(width * height * sizeof(float));
        else
            cudaHostAlloc(&mat.elements, width * height * sizeof(float), cudaHostAllocDefault);
    
        for (int row = 0; row < height; row++) {
            for (int col = 0; col < width; col++) {
                mat.elements[row * width + col] = row * width + col * 0.001;
            }
        }
    }
}

bool IsMatDiff(Matrix &A, Matrix &B) {
    if (A.width != B.width || A.height != B.height) {
        return true;
    }
    
    int count = 0;
    for (int row = 0; row < A.height; row++) {
        for (int col = 0; col < A.width; col++) {
            count += (A.elements[row * A.width + col] != B.elements[row * B.width + col]) ? 1 : 0;
            
            if (A.elements[row * A.width + col] != B.elements[row * B.width + col]) {
            printf("%f %f\n", A.elements[row * A.width + col], B.elements[row * B.width + col]);
            break;
        }
        }
    }
    
    if (count != 0) {
        printf("Count: %d\n", count);
        return true;
    }
    return false;
}

Overwriting test_texture.cu
