# SAXPY Example

$$ y = ax + y $$
위 식을 벡터 $x$, $y$의 각 원소에 대해 계산한다. 여기서 $a$는 스칼라 상수이다.

## CPU Version

In [1]:
%%file saxpy_cpu.cc

#include <stdio.h>
#include <stdlib.h>

/*
 * Allocate a heap buffer of n_size floats and fill it with the ramp
 * p_vector[i] = seed * i.
 *
 * Returns the buffer, which the caller releases with free(), or NULL when
 * the allocation fails.
 */
float* get_vector(int n_size, float seed) {
    // buffer create
    float* p_vector = (float*)malloc((size_t)n_size * sizeof(float));
    if (p_vector == NULL) {
        // Report the failure to the caller instead of writing through NULL.
        return NULL;
    }

    // initialize vector
    for (int i = 0; i < n_size; i++) {
        p_vector[i] = seed * i;
    }

    return p_vector;
}

/*
 * Print the first n_size elements of p_vector to stdout, ten values per
 * line, each formatted with "%3.2f ".
 *
 * A final row with fewer than ten values is printed as well, so no element
 * is dropped when n_size is not a multiple of ten. For multiples of ten the
 * output is the same as printing full rows only.
 */
void print_vector(float* p_vector, int n_size) {
    for (int i = 0; i < n_size; i++) {
        printf("%3.2f ", p_vector[i]);
        // End the line after every tenth value and after the very last value.
        if (i % 10 == 9 || i == n_size - 1) {
            printf("\n");
        }
    }
}

// Computes y = a*x + y element by element, storing the result back into py.
void saxpy(float* py, float* px, float alpha, int n_size) {
    const float* src = px;
    float* dst = py;

    // Walk both vectors in lock step; a non-positive n_size does nothing.
    for (int remaining = n_size; remaining > 0; --remaining, ++src, ++dst) {
        *dst = alpha * (*src) + (*dst);
    }
}

/*
 * Build two ramp vectors of 65536 floats, print the first 100 elements of
 * each, apply SAXPY on the host with alpha = 2 and print the first 100
 * elements of the result.
 *
 * Returns 0 on success and 1 when a vector could not be allocated.
 */
int main() {
    float *px, *py;
    int n_size = 65536;

    // float literals avoid a silent double -> float conversion at the call
    px = get_vector(n_size, 0.01f);
    py = get_vector(n_size, 0.05f);
    if (px == NULL || py == NULL) {
        fprintf(stderr, "failed to allocate host vectors\n");
        free(px);  // free(NULL) is a no-op, so both calls are safe
        free(py);
        return 1;
    }

    printf("X\n");
    print_vector(px, 100);
    printf("Y\n");
    print_vector(py, 100);

    saxpy(py, px, 2.0f, n_size);

    printf("saxpy:: y = ax + y\n");
    print_vector(py, 100);

    free(px);
    free(py);

    return 0;
}


Writing saxpy_cpu.cc


### Compile 및 실행

In [2]:
! make cpu
! ./saxpy_cpu

gcc -Wall -c saxpy_cpu.cc
gcc saxpy_cpu.o -o saxpy_cpu
X
0.00 0.01 0.02 0.03 0.04 0.05 0.06 0.07 0.08 0.09 
0.10 0.11 0.12 0.13 0.14 0.15 0.16 0.17 0.18 0.19 
0.20 0.21 0.22 0.23 0.24 0.25 0.26 0.27 0.28 0.29 
0.30 0.31 0.32 0.33 0.34 0.35 0.36 0.37 0.38 0.39 
0.40 0.41 0.42 0.43 0.44 0.45 0.46 0.47 0.48 0.49 
0.50 0.51 0.52 0.53 0.54 0.55 0.56 0.57 0.58 0.59 
0.60 0.61 0.62 0.63 0.64 0.65 0.66 0.67 0.68 0.69 
0.70 0.71 0.72 0.73 0.74 0.75 0.76 0.77 0.78 0.79 
0.80 0.81 0.82 0.83 0.84 0.85 0.86 0.87 0.88 0.89 
0.90 0.91 0.92 0.93 0.94 0.95 0.96 0.97 0.98 0.99 
Y
0.00 0.05 0.10 0.15 0.20 0.25 0.30 0.35 0.40 0.45 
0.50 0.55 0.60 0.65 0.70 0.75 0.80 0.85 0.90 0.95 
1.00 1.05 1.10 1.15 1.20 1.25 1.30 1.35 1.40 1.45 
1.50 1.55 1.60 1.65 1.70 1.75 1.80 1.85 1.90 1.95 
2.00 2.05 2.10 2.15 2.20 2.25 2.30 2.35 2.40 2.45 
2.50 2.55 2.60 2.65 2.70 2.75 2.80 2.85 2.90 2.95 
3.00 3.05 3.10 3.15 3.20 3.25 3.30 3.35 3.40 3.45 
3.50 3.55 3.60 3.65 3.70 3.75 3.80 3.85 3.90 3.95 
4.00 4.05 4.10 4.15 4.2

## GPU Version

In [3]:
%%file saxpy_gpu.cu

#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>

/*
 * Allocate a heap buffer of n_size floats.
 *
 * With a non-zero seed the buffer holds the ramp p_vector[i] = seed * i.
 * With the default seed of 0 the buffer is zero-filled, so callers never
 * read indeterminate memory.
 *
 * Returns the buffer, which the caller releases with free(), or NULL when
 * the allocation fails.
 */
float* get_vector(int n_size, float seed = 0.0) {
    // buffer create; calloc zero-fills, which covers the seed == 0 case
    float* p_vector = (float*)calloc((size_t)n_size, sizeof(float));
    if (p_vector == NULL) {
        return NULL;
    }

    // initialize vector
    if (seed != 0.0) {
        for (int i = 0; i < n_size; i++) {
            p_vector[i] = seed * i;
        }
    }

    return p_vector;
}

/*
 * Compare the host result py with the device result py_cuda over n_size
 * elements and print the accumulated difference as "Result: <sum>".
 *
 * Absolute differences are summed so that errors of opposite sign cannot
 * cancel each other and hide a mismatch; 0 is printed only when every pair
 * of elements is equal.
 */
void check_result(float* py, float* py_cuda, int n_size) {
    float compare = 0.0f;
    for (int i = 0; i < n_size; i++) {
        float diff = py[i] - py_cuda[i];
        // Manual absolute value keeps the block free of extra headers.
        compare += (diff < 0.0f) ? -diff : diff;
    }
    printf("Result: %f\n", compare);
}

// Host (CPU) reference: py[k] = alpha * px[k] + py[k] for every k in [0, n_size).
void saxpy(float* py, float* px, float alpha, int n_size) {
    int k = 0;
    while (k < n_size) {
        const float scaled = alpha * px[k];
        py[k] = scaled + py[k];
        ++k;
    }
}

// CUDA Kernel function
//
// Computes d_y[idx] = alpha * d_x[idx] + d_y[idx] with one element per thread.
// Launch layout: 1D grid of 1D blocks whose total thread count is at least
// n_size. d_x and d_y must be device pointers to at least n_size floats.
// No shared memory is used.
__global__ 
void d_saxpy(float* d_y, float* d_x, float alpha, int n_size) {
    int idx = blockDim.x * blockIdx.x + threadIdx.x;

    // The grid is rounded up to whole blocks, so trailing threads may fall
    // past the end of the vectors and must not touch memory.
    if (idx < n_size) {
        d_y[idx] = alpha * d_x[idx] + d_y[idx];
    }
}

// Abort with a readable message when a CUDA runtime call reports an error.
static void check_cuda(cudaError_t status, const char* what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Run SAXPY (alpha = 2) over 65536 floats on both the host and the GPU and
 * print the accumulated difference between the two results.
 *
 * Returns 0 on success and 1 when a host buffer could not be allocated.
 * Any CUDA runtime error terminates the process through check_cuda().
 */
int main() {
    float *px, *py, *py_cuda;
    int n_size = 65536;
    size_t n_bytes = (size_t)n_size * sizeof(float);

    // float literals avoid a silent double -> float conversion at the call
    px = get_vector(n_size, 0.01f);
    py = get_vector(n_size, 0.05f);
    py_cuda = get_vector(n_size);
    if (px == NULL || py == NULL || py_cuda == NULL) {
        fprintf(stderr, "failed to allocate host vectors\n");
        free(px);  // free(NULL) is a no-op
        free(py);
        free(py_cuda);
        return 1;
    }

    // Step 1. Create GPU memory
    float *d_x, *d_y;
    check_cuda(cudaMalloc((void**)&d_x, n_bytes), "cudaMalloc(d_x)");
    check_cuda(cudaMalloc((void**)&d_y, n_bytes), "cudaMalloc(d_y)");

    // Step 2. Copy to GPU memory
    check_cuda(cudaMemcpy(d_x, px, n_bytes, cudaMemcpyHostToDevice), "copy x to device");
    check_cuda(cudaMemcpy(d_y, py, n_bytes, cudaMemcpyHostToDevice), "copy y to device");

    // Step 3. Kernel Call
    // The host reference runs after the upload so the device starts from the
    // unmodified y.
    saxpy(py, px, 2.0f, n_size);

    // 256 threads per block is a multiple of the 32-lane warp size; the grid
    // is rounded up so every element is covered.
    dim3 block(256);
    dim3 grid((n_size + block.x - 1) / block.x);
    d_saxpy<<< grid, block >>>(d_y, d_x, 2.0f, n_size);
    // A launch does not return a status; fetch configuration errors explicitly.
    check_cuda(cudaGetLastError(), "d_saxpy launch");

    // Step 4. Copy from GPU
    // The blocking copy also surfaces errors raised while the kernel was running.
    check_cuda(cudaMemcpy(py_cuda, d_y, n_bytes, cudaMemcpyDeviceToHost), "copy y to host");

    // Step 5. Check Result
    check_result(py, py_cuda, n_size);

    // Step 6. Finalize GPU memory
    check_cuda(cudaFree(d_x), "cudaFree(d_x)");
    check_cuda(cudaFree(d_y), "cudaFree(d_y)");

    free(px);
    free(py);
    free(py_cuda);

    return 0;
}


Writing saxpy_gpu.cu


### Compile 및 실행

In [4]:
! make gpu
! ./saxpy_gpu

nvcc -c saxpy_gpu.cu
nvcc saxpy_gpu.o -o saxpy_gpu
Result: 0.000000


## SAXPY 2D

In [5]:
%%file saxpy_gpu_2d.cu

#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>

/*
 * Allocate a heap buffer of n_size floats.
 *
 * With a non-zero seed the buffer holds the ramp p_vector[i] = seed * i.
 * With the default seed of 0 the buffer is zero-filled, so callers never
 * read indeterminate memory.
 *
 * Returns the buffer, which the caller releases with free(), or NULL when
 * the allocation fails.
 */
float* get_buffer(int n_size, float seed = 0.0) {
    // buffer create; calloc zero-fills, which covers the seed == 0 case
    float* p_vector = (float*)calloc((size_t)n_size, sizeof(float));
    if (p_vector == NULL) {
        return NULL;
    }

    // initialize vector
    if (seed != 0.0) {
        for (int i = 0; i < n_size; i++) {
            p_vector[i] = seed * i;
        }
    }

    return p_vector;
}

/*
 * Print the first n_size elements of p_buffer to stdout, ten values per
 * line, each formatted with "%3.2f ".
 *
 * A final row with fewer than ten values is printed as well, so no element
 * is dropped when n_size is not a multiple of ten. For multiples of ten the
 * output is the same as printing full rows only.
 */
void print_buffer(float* p_buffer, int n_size) {
    for (int i = 0; i < n_size; i++) {
        printf("%3.2f ", p_buffer[i]);
        // End the line after every tenth value and after the very last value.
        if (i % 10 == 9 || i == n_size - 1) {
            printf("\n");
        }
    }
}

/*
 * Compare the host result py with the device result py_cuda over a
 * row-major n_width x n_height buffer and print the accumulated difference
 * as "Result: <sum>".
 *
 * The row index runs over n_height and the column index over n_width, which
 * matches the row stride of n_width used for indexing, so non-square
 * buffers are covered exactly once without reading out of bounds.
 * Absolute differences are summed so that errors of opposite sign cannot
 * cancel each other.
 */
void check_result(float* py, float* py_cuda, int n_width, int n_height) {
    float compare = 0.0f;
    for (int j = 0; j < n_height; j++) {        // row
        for (int i = 0; i < n_width; i++) {     // column
            int idx = j * n_width + i;
            float diff = py[idx] - py_cuda[idx];
            compare += (diff < 0.0f) ? -diff : diff;
        }
    }
    printf("Result: %f\n", compare);
}

/* CPU function */
/*
 * Host reference: py[idx] = alpha * px[idx] + py[idx] for every element of
 * a row-major n_width x n_height buffer.
 *
 * The row stride is n_width (elements per row). Using n_height as the
 * stride is only correct for square buffers; otherwise rows overlap and
 * some elements are updated twice while others are skipped.
 */
void saxpy(float* py, float* px, float alpha, int n_width, int n_height) {
    for (int j = 0; j < n_height; j++) {
        for (int i = 0; i < n_width; i++) {
            int idx = n_width * j + i;
            py[idx] = alpha * px[idx] + py[idx];
        }
    }
}

/* CUDA Kernel function */
/*
 * Computes d_y[idx] = alpha * d_x[idx] + d_y[idx] over a row-major
 * n_width x n_height buffer.
 *
 * Launch layout: any 1D or 2D grid of 1D or 2D blocks. Each thread starts at
 * its own (x, y) position and advances by the total grid extent in each
 * dimension (a 2D grid-stride loop), so every element is processed exactly
 * once whatever the launch shape, and no thread indexes outside the buffer.
 * d_x and d_y must be device pointers to at least n_width * n_height floats.
 * No shared memory is used.
 */
__global__ 
void d_saxpy(float* d_y, float* d_x, float alpha, int n_width, int n_height) {
    int idx_x = blockDim.x * blockIdx.x + threadIdx.x; 
    int idx_y = blockDim.y * blockIdx.y + threadIdx.y;

    // Total number of threads along each grid dimension.
    int stride_x = blockDim.x * gridDim.x;
    int stride_y = blockDim.y * gridDim.y;

    for (int row = idx_y; row < n_height; row += stride_y) {
        for (int col = idx_x; col < n_width; col += stride_x) {
            int idx = n_width * row + col;
            d_y[idx] = alpha * d_x[idx] + d_y[idx];
        }
    }
}

// Abort with a readable message when a CUDA runtime call reports an error.
static void check_cuda(cudaError_t status, const char* what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Run SAXPY (alpha = 2) over a 256 x 256 float buffer on both the host and
 * the GPU and print the accumulated difference between the two results.
 *
 * Returns 0 on success and 1 when a host buffer could not be allocated.
 * Any CUDA runtime error terminates the process through check_cuda().
 */
int main() {
    float *px, *py, *py_cuda;
    int n_width = 256;
    int n_height = 256;
    int n_size = n_width * n_height;
    size_t n_bytes = (size_t)n_size * sizeof(float);

    // float literals avoid a silent double -> float conversion at the call
    px = get_buffer(n_size, 0.01f);
    py = get_buffer(n_size, 0.05f);
    py_cuda = get_buffer(n_size);
    if (px == NULL || py == NULL || py_cuda == NULL) {
        fprintf(stderr, "failed to allocate host buffers\n");
        free(px);  // free(NULL) is a no-op
        free(py);
        free(py_cuda);
        return 1;
    }

    // Step 1. Create GPU memory
    float *d_x, *d_y;
    check_cuda(cudaMalloc((void**)&d_x, n_bytes), "cudaMalloc(d_x)");
    check_cuda(cudaMalloc((void**)&d_y, n_bytes), "cudaMalloc(d_y)");

    // Step 2. Copy to GPU memory
    check_cuda(cudaMemcpy(d_x, px, n_bytes, cudaMemcpyHostToDevice), "copy x to device");
    check_cuda(cudaMemcpy(d_y, py, n_bytes, cudaMemcpyHostToDevice), "copy y to device");

    // Step 3. Kernel Call
    // The host reference runs after the upload so the device starts from the
    // unmodified y.
    saxpy(py, px, 2.0f, n_width, n_height);

    // 2D launch so the kernel's y index is actually used: 16 x 16 = 256
    // threads per block, grid rounded up in both dimensions.
    dim3 block(16, 16);
    dim3 grid((n_width + block.x - 1) / block.x,
              (n_height + block.y - 1) / block.y);
    d_saxpy<<< grid, block >>>(d_y, d_x, 2.0f, n_width, n_height);
    // A launch does not return a status; fetch configuration errors explicitly.
    check_cuda(cudaGetLastError(), "d_saxpy launch");

    // Step 4. Copy from GPU
    // The blocking copy also surfaces errors raised while the kernel was running.
    check_cuda(cudaMemcpy(py_cuda, d_y, n_bytes, cudaMemcpyDeviceToHost), "copy y to host");

    // Step 5. Compare CPU & GPU result
    check_result(py, py_cuda, n_width, n_height);

    // Step 6. Finalize GPU memory
    check_cuda(cudaFree(d_x), "cudaFree(d_x)");
    check_cuda(cudaFree(d_y), "cudaFree(d_y)");

    free(px);
    free(py);
    free(py_cuda);

    return 0;
}


Writing saxpy_gpu_2d.cu


### Compile 및 실행

In [6]:
! make gpu_2d
! ./saxpy_gpu_2d

nvcc -c saxpy_gpu_2d.cu
nvcc saxpy_gpu_2d.o -o saxpy_gpu_2d
Result: 0.000000
