# Parallel Communication Patterns for CUDA I

* Map
* Gather
* Stencil
* Scatter
* Transpose

## 1. Map

In [None]:
%%file map.cu

#include <stdio.h>

// Allocates an uninitialized host buffer with room for `n_size` floats.
// Returns the malloc result unchanged (NULL on failure); release it with free().
float* get_buffer(int n_size) {
    const size_t n_bytes = n_size * sizeof(float);
    void* raw = malloc(n_bytes);
    return static_cast<float*>(raw);
}

// Map pattern: each thread copies one element, d_out[i] = d_in[i].
//
// Launch layout: 1D grid of 1D blocks, one thread per element.
// d_out, d_in : device arrays holding at least n_size floats.
// n_size      : number of elements to copy.
__global__
void d_map(float* d_out, float* d_in, int n_size) {
    int idx = blockDim.x * blockIdx.x + threadIdx.x;

    // The grid is usually rounded up to a whole number of blocks, so threads
    // past the end of the data must not touch memory.
    if (idx < n_size) {
        d_out[idx] = d_in[idx];
    }
}

// Shifted map: element i of d_in is written to position (i + n_shift) mod n_size
// of d_out, i.e. the input is rotated by n_shift positions.
//
// Launch layout: 1D grid of 1D blocks, one thread per element.
// d_out, d_in : device arrays holding at least n_size floats.
// n_size      : number of elements.
// n_shift     : rotation distance; may be negative or larger than n_size.
__global__
void d_map_shift(float* d_out, float* d_in, int n_size, int n_shift) {
    int idx = blockDim.x * blockIdx.x + threadIdx.x;

    // Threads past the end of the data do nothing. This also guarantees
    // n_size > 0 below, so the modulo never divides by zero.
    if (idx >= n_size) {
        return;
    }

    // Normalize the shift into [0, n_size) first: C++ '%' keeps the sign of
    // the dividend, so a large negative shift would otherwise yield a
    // negative destination index.
    int shift = n_shift % n_size;
    if (shift < 0) {
        shift += n_size;
    }

    int dst = idx + shift;
    if (dst >= n_size) {
        dst -= n_size;
    }

    d_out[dst] = d_in[idx];
}

// Counts the positions at which p_in and p_out differ over the first n_size
// elements and prints the total as "Result: <count>" (0 means identical).
void check_result(float *p_in, float *p_out, int n_size) {
    int mismatches = 0;

    for (int pos = 0; pos < n_size; ++pos) {
        const float delta = p_in[pos] - p_out[pos];
        if (delta != 0) {
            ++mismatches;
        }
    }

    printf("Result: %d\n", mismatches);
}

// Aborts the program with file/line and the CUDA error string when a runtime
// call fails, instead of silently continuing with invalid state.
#define CUDA_CHECK(call)                                                  \
    do {                                                                  \
        cudaError_t err_ = (call);                                        \
        if (err_ != cudaSuccess) {                                        \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, \
                    cudaGetErrorString(err_));                            \
            exit(EXIT_FAILURE);                                           \
        }                                                                 \
    } while (0)

// Map demo: runs d_map and d_map_shift on the GPU and verifies each result
// against the expected output. Each check prints "Result: 0" on success.
int main() {
    float *p_in, *p_out, *p_expected;
    float *d_in, *d_out;

    const int n_size = 65536;
    const int n_shift = 2;
    const size_t n_bytes = n_size * sizeof(float);

    p_in = get_buffer(n_size);
    p_out = get_buffer(n_size);
    p_expected = get_buffer(n_size);
    if (p_in == NULL || p_out == NULL || p_expected == NULL) {
        fprintf(stderr, "Host allocation failed\n");
        return EXIT_FAILURE;
    }

    CUDA_CHECK(cudaMalloc((void**)&d_in, n_bytes));
    CUDA_CHECK(cudaMalloc((void**)&d_out, n_bytes));

    for (int i = 0; i < n_size; i++) {
        p_in[i] = i;
    }
    CUDA_CHECK(cudaMemcpy(d_in, p_in, n_bytes, cudaMemcpyHostToDevice));

    dim3 blockDim(256);
    dim3 gridDim((n_size + blockDim.x - 1) / blockDim.x);

    // 1) Plain map: the output must equal the input.
    d_map<<<gridDim, blockDim>>>(d_out, d_in, n_size);
    CUDA_CHECK(cudaGetLastError());  // launch-configuration errors
    // The blocking copy also reports any error raised while the kernel ran.
    CUDA_CHECK(cudaMemcpy(p_out, d_out, n_bytes, cudaMemcpyDeviceToHost));

    check_result(p_in, p_out, n_size);

    // 2) Shifted map: build the rotated reference on the host. Comparing the
    // shifted output against the unshifted input would flag every element.
    for (int i = 0; i < n_size; i++) {
        p_expected[(i + n_shift) % n_size] = p_in[i];
    }

    d_map_shift<<<gridDim, blockDim>>>(d_out, d_in, n_size, n_shift);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaMemcpy(p_out, d_out, n_bytes, cudaMemcpyDeviceToHost));

    check_result(p_expected, p_out, n_size);

    CUDA_CHECK(cudaFree(d_in));
    CUDA_CHECK(cudaFree(d_out));

    free(p_in);
    free(p_out);
    free(p_expected);

    return 0;
}

In [None]:
! make map
! ./map

## 2/3. Gather & Stencil

In [None]:
%%file gather.cu

#include <stdio.h>

// Number of float elements in each host and device array used by this demo.
const int n_size = 1024;

// Allocates an uninitialized host buffer with room for `n_size` floats.
// Returns the malloc result unchanged (NULL on failure); release it with free().
float* get_buffer(int n_size) {
    const size_t n_bytes = n_size * sizeof(float);
    void* raw = malloc(n_bytes);
    return static_cast<float*>(raw);
}

// Host reference for the gather pattern: p_out[j] is the sum of the n_width
// inputs p_in[j], p_in[j + 1], ..., with indices wrapping around at n_size.
void gather_sum(float* p_out, float* p_in, int n_size, int n_width) {
    for (int out_idx = 0; out_idx < n_size; ++out_idx) {
        float acc = 0.0;

        for (int offset = 0; offset < n_width; ++offset) {
            const int src = out_idx + offset;
            // Wrap only when the window runs past the end of the input.
            acc += p_in[(src >= n_size) ? (src % n_size) : src];
        }

        p_out[out_idx] = acc;
    }
}

// Gather pattern: each thread reads n_width consecutive inputs starting at its
// own index (wrapping around at n_size) and writes their sum to d_out[idx].
//
// Launch layout: 1D grid of 1D blocks, one thread per output element.
// d_out, d_in : device arrays holding at least n_size floats.
// n_size      : number of elements.
// n_width     : number of inputs summed per output.
__global__
void d_gather_sum(float* d_out, float* d_in, int n_size, int n_width) {
    int idx = blockDim.x * blockIdx.x + threadIdx.x;

    // The grid may be rounded up past n_size; tail threads must not write.
    if (idx >= n_size) {
        return;
    }

    float sum = 0.0f;  // float literal: keeps the accumulator in single precision
    for (int i = 0; i < n_width; i++) {
        int input_idx = idx + i;

        // Wrap the window around the end of the input.
        if (input_idx >= n_size) {
            input_idx %= n_size;
        }

        sum += d_in[input_idx];
    }
    d_out[idx] = sum;
}

// Counts the positions at which p_out and p_out_host differ over the first
// n_size elements and prints the total as "Result: <count>".
// The counter is a float and is printed with %f, so a clean run prints
// "Result: 0.000000".
void check_result(float *p_out, float *p_out_host, int n_size) {
    float mismatches = 0.0;

    for (int pos = 0; pos < n_size; ++pos) {
        const float delta = p_out[pos] - p_out_host[pos];
        if (delta != 0) {
            mismatches += 1;
        }
    }

    printf("Result: %f\n", mismatches);
}

// Aborts the program with file/line and the CUDA error string when a runtime
// call fails, instead of silently continuing with invalid state.
#define CUDA_CHECK(call)                                                  \
    do {                                                                  \
        cudaError_t err_ = (call);                                        \
        if (err_ != cudaSuccess) {                                        \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, \
                    cudaGetErrorString(err_));                            \
            exit(EXIT_FAILURE);                                           \
        }                                                                 \
    } while (0)

// Gather demo: computes the windowed sum on the host and on the GPU and
// prints the number of mismatching elements.
int main() {
    float *p_in, *p_out, *p_out_host;
    float *d_in, *d_out;
    int n_width = 3;
    const size_t n_bytes = n_size * sizeof(float);

    p_in = get_buffer(n_size);
    p_out = get_buffer(n_size);
    p_out_host = get_buffer(n_size);
    if (p_in == NULL || p_out == NULL || p_out_host == NULL) {
        fprintf(stderr, "Host allocation failed\n");
        return EXIT_FAILURE;
    }

    CUDA_CHECK(cudaMalloc((void**)&d_in, n_bytes));
    CUDA_CHECK(cudaMalloc((void**)&d_out, n_bytes));

    // Initialize all n_size inputs (the bound follows n_size rather than a
    // separate literal so the two cannot drift apart).
    for (int i = 0; i < n_size; i++) {
        p_in[i] = i;
    }
    CUDA_CHECK(cudaMemcpy(d_in, p_in, n_bytes, cudaMemcpyHostToDevice));

    // Host reference result.
    gather_sum(p_out_host, p_in, n_size, n_width);

    dim3 blockDim(256);
    dim3 gridDim((n_size + blockDim.x - 1) / blockDim.x);
    d_gather_sum<<<gridDim, blockDim>>>(d_out, d_in, n_size, n_width);
    CUDA_CHECK(cudaGetLastError());  // launch-configuration errors

    // The blocking copy also reports any error raised while the kernel ran.
    CUDA_CHECK(cudaMemcpy(p_out, d_out, n_bytes, cudaMemcpyDeviceToHost));

    check_result(p_out, p_out_host, n_size);

    CUDA_CHECK(cudaFree(d_in));
    CUDA_CHECK(cudaFree(d_out));

    free(p_in);
    free(p_out);
    free(p_out_host);

    return 0;
}

In [None]:
! make gather
! ./gather

## 4. Scatter

In [None]:
%%file scatter.cu

#include <stdio.h>

// Number of float elements in the input array.
const int n_size = 1024;
// Number of n_size-long output segments; output arrays hold n_size * n_scatter floats.
const int n_scatter = 3;

// Allocates an uninitialized host buffer with room for `n_size` floats.
// Returns the malloc result unchanged (NULL on failure); release it with free().
float* get_buffer(int n_size) {
    const size_t n_bytes = n_size * sizeof(float);
    void* raw = malloc(n_bytes);
    return static_cast<float*>(raw);
}

// Host reference for the scatter pattern: every input element p_in[j] is
// written to n_scatter output locations, p_out[i * n_size + j] for
// i = 0 .. n_scatter - 1.
//
// p_out     : output array holding at least n_size * n_scatter floats.
// p_in      : input array holding at least n_size floats.
// n_size    : number of input elements.
// n_scatter : number of output locations per input element.
void scatter(float* p_out, float* p_in, int n_size, int n_scatter) {
    for (int j = 0; j < n_size; j++) {
        // Scatter the input value itself rather than its index, so the result
        // is correct for any input contents, not only p_in[j] == j.
        const float value = p_in[j];
        for (int i = 0; i < n_scatter; i++) {
            p_out[i * n_size + j] = value;
        }
    }
}

// Scatter pattern: each thread reads one input element and writes it to
// n_scatter output locations, d_out[i * n_size + idx] for i = 0 .. n_scatter - 1.
//
// Launch layout: 1D grid of 1D blocks, one thread per input element.
// d_out     : device array holding at least n_size * n_scatter floats.
// d_in      : device array holding at least n_size floats.
// n_size    : number of input elements.
// n_scatter : number of output locations per input element.
__global__
void d_scatter(float* d_out, float* d_in, int n_size, int n_scatter) {
    int idx = blockDim.x * blockIdx.x + threadIdx.x;

    // Tail threads of a rounded-up grid must do nothing: without this guard
    // they would overwrite valid entries of the next output segment.
    if (idx >= n_size) {
        return;
    }

    // Scatter the input value itself rather than the thread index, so the
    // result is correct for any input contents, not only d_in[idx] == idx.
    const float value = d_in[idx];
    for (int i = 0; i < n_scatter; i++) {
        d_out[i * n_size + idx] = value;
    }
}

// Counts the positions at which p_out and p_out_host differ over the first
// n_size * n_scatter elements and prints the total as "Result: <count>".
// The counter is a float and is printed with %f, so a clean run prints
// "Result: 0.000000".
void check_result(float *p_out, float *p_out_host, int n_size, int n_scatter) {
    float mismatches = 0.0;
    const int n_total = n_size * n_scatter;

    for (int pos = 0; pos < n_total; ++pos) {
        const float delta = p_out[pos] - p_out_host[pos];
        if (delta != 0) {
            mismatches += 1;
        }
    }

    printf("Result: %f\n", mismatches);
}

// Aborts the program with file/line and the CUDA error string when a runtime
// call fails, instead of silently continuing with invalid state.
#define CUDA_CHECK(call)                                                  \
    do {                                                                  \
        cudaError_t err_ = (call);                                        \
        if (err_ != cudaSuccess) {                                        \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, \
                    cudaGetErrorString(err_));                            \
            exit(EXIT_FAILURE);                                           \
        }                                                                 \
    } while (0)

// Scatter demo: runs the scatter on the host and on the GPU and prints the
// number of mismatching output elements.
int main() {
    float *p_in, *p_out, *p_out_host;
    float *d_in, *d_out;

    const size_t in_bytes = n_size * sizeof(float);
    const size_t out_bytes = n_size * n_scatter * sizeof(float);

    p_in = get_buffer(n_size);
    p_out = get_buffer(n_size * n_scatter);
    p_out_host = get_buffer(n_size * n_scatter);
    if (p_in == NULL || p_out == NULL || p_out_host == NULL) {
        fprintf(stderr, "Host allocation failed\n");
        return EXIT_FAILURE;
    }

    CUDA_CHECK(cudaMalloc((void**)&d_in, in_bytes));
    CUDA_CHECK(cudaMalloc((void**)&d_out, out_bytes));

    for (int i = 0; i < n_size; i++) {
        p_in[i] = i;
    }
    CUDA_CHECK(cudaMemcpy(d_in, p_in, in_bytes, cudaMemcpyHostToDevice));

    // Host reference result.
    scatter(p_out_host, p_in, n_size, n_scatter);

    dim3 blockDim(256);
    dim3 gridDim((n_size + blockDim.x - 1) / blockDim.x);
    d_scatter<<<gridDim, blockDim>>>(d_out, d_in, n_size, n_scatter);
    CUDA_CHECK(cudaGetLastError());  // launch-configuration errors

    // The blocking copy also reports any error raised while the kernel ran.
    CUDA_CHECK(cudaMemcpy(p_out, d_out, out_bytes, cudaMemcpyDeviceToHost));

    check_result(p_out, p_out_host, n_size, n_scatter);

    CUDA_CHECK(cudaFree(d_in));
    CUDA_CHECK(cudaFree(d_out));

    free(p_in);
    free(p_out);
    free(p_out_host);

    return 0;
}

In [None]:
! make scatter
! ./scatter

## 5. Transpose

In [None]:
%%file transpose.cu

#include <stdio.h>

// Allocates an uninitialized host buffer with room for `n_size` floats.
// Returns the malloc result unchanged (NULL on failure); release it with free().
float* get_buffer(int n_size) {
    const size_t n_bytes = n_size * sizeof(float);
    void* raw = malloc(n_bytes);
    return static_cast<float*>(raw);
}

// Counts the positions at which p_out and p_out_cuda differ over the first
// n_size elements and prints the total as "Result: <count>" (0 means identical).
void check_result(float *p_out, float *p_out_cuda, int n_size) {
    int mismatches = 0;

    for (int pos = 0; pos < n_size; ++pos) {
        const bool differs = (p_out[pos] - p_out_cuda[pos] != 0.0);
        mismatches += differs ? 1 : 0;
    }

    printf("Result: %d\n", mismatches);
}

// Host reference transpose: reads p_in as n_height rows of n_width floats
// (row-major) and writes p_out as n_width rows of n_height floats, so that
// p_out[i * n_height + j] == p_in[j * n_width + i].
void transpose(float *p_out, float *p_in, int n_width, int n_height) {
    // Walk the input linearly in row-major order; only the output index
    // needs to be computed.
    const float *src = p_in;

    for (int row = 0; row < n_height; ++row) {
        for (int col = 0; col < n_width; ++col) {
            p_out[col * n_height + row] = *src;
            ++src;
        }
    }
}

// Transpose pattern: the thread at (column, row) copies d_in[row * n_width + column]
// to d_out[column * n_height + row].
//
// Launch layout: 2D grid of 2D blocks, x covering columns and y covering rows.
// Threads outside the n_width x n_height extent do nothing.
__global__
void d_transpose(float *d_out, float *d_in, int n_width, int n_height) {
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;

    // Skip the threads of a rounded-up grid that fall outside the matrix.
    if (col >= n_width || row >= n_height) {
        return;
    }

    d_out[col * n_height + row] = d_in[row * n_width + col];
}

// Aborts the program with file/line and the CUDA error string when a runtime
// call fails, instead of silently continuing with invalid state.
#define CUDA_CHECK(call)                                                  \
    do {                                                                  \
        cudaError_t err_ = (call);                                        \
        if (err_ != cudaSuccess) {                                        \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, \
                    cudaGetErrorString(err_));                            \
            exit(EXIT_FAILURE);                                           \
        }                                                                 \
    } while (0)

// Transpose demo: transposes a n_height x n_width matrix on the host and on
// the GPU and prints the number of mismatching elements.
int main() {
    float *p_in, *p_out, *p_out_cuda;
    float *d_in, *d_out;

    const int n_width = 1920;
    const int n_height = 1080;
    const int n_elems = n_width * n_height;
    const size_t n_bytes = n_elems * sizeof(float);

    p_in = get_buffer(n_elems);
    p_out = get_buffer(n_elems);
    p_out_cuda = get_buffer(n_elems);
    if (p_in == NULL || p_out == NULL || p_out_cuda == NULL) {
        fprintf(stderr, "Host allocation failed\n");
        return EXIT_FAILURE;
    }

    // Step 1. Allocate GPU memory
    CUDA_CHECK(cudaMalloc((void**)&d_in, n_bytes));
    CUDA_CHECK(cudaMalloc((void**)&d_out, n_bytes));

    // Initialize input data
    for (int j = 0; j < n_height; j++) {
        for (int i = 0; i < n_width; i++) {
            p_in[j * n_width + i] = float(j * n_width + i);
        }
    }

    // Step 2. Copy to GPU memory
    CUDA_CHECK(cudaMemcpy(d_in, p_in, n_bytes, cudaMemcpyHostToDevice));

    // Host reference result
    transpose(p_out, p_in, n_width, n_height);

    // Step 3. Kernel launch
    dim3 blockDim(16, 16);
    dim3 gridDim((n_width + blockDim.x - 1) / blockDim.x, (n_height + blockDim.y - 1) / blockDim.y);
    d_transpose<<<gridDim, blockDim>>>(d_out, d_in, n_width, n_height);
    CUDA_CHECK(cudaGetLastError());  // launch-configuration errors

    // Step 4. Copy from GPU (the blocking copy also reports any error raised
    // while the kernel ran)
    CUDA_CHECK(cudaMemcpy(p_out_cuda, d_out, n_bytes, cudaMemcpyDeviceToHost));

    // Step 5. Check result
    check_result(p_out, p_out_cuda, n_elems);

    // Step 6. Free GPU and host memory
    CUDA_CHECK(cudaFree(d_in));
    CUDA_CHECK(cudaFree(d_out));

    free(p_in);
    free(p_out);
    free(p_out_cuda);

    return 0;
}

In [None]:
! make transpose
! ./transpose