# Parallel Communication Patterns for CUDA I

* Map
* Gather
* Stencil
* Scatter
* Transpose

In [None]:
%%file util.c

// Allocates a host buffer of n_size floats filled with the ramp
// 0, 1, 2, ..., n_size - 1.
//
// Returns the malloc'ed buffer, which the caller releases with free(), or
// NULL when the allocation fails.
float* get_buffer(int n_size) {
    float* buffer = (float*)malloc(n_size * sizeof(float));
    
    // The random fill below is disabled, but the generator is still seeded so
    // that re-enabling it needs no other change.
    time_t t;
    srand((unsigned) time(&t));
    
    if (buffer == NULL) {
        // Allocation failed: hand NULL back instead of writing through it.
        return NULL;
    }
    
    for (int i = 0; i < n_size; i++) {
        //buffer[i] = (float)rand()/(float)(RAND_MAX/100);
        buffer[i] = (float)i;
    }
    return buffer;
}

// Compares the first n_size elements of p_A and p_B with exact float
// equality and prints how many positions differ, so "Result: 0" means the
// two arrays are identical.
void check_result(float *p_A, float *p_B, int n_size) {
    int mismatches = 0;
    int i = 0;
    while (i < n_size) {
        if (p_A[i] != p_B[i]) {
            ++mismatches;
        }
        ++i;
    }
    printf("Result: %d\n", mismatches);
}

## 1. Map

In [None]:
%%file map.cu

#include <stdio.h>
#include "util.c"

// Map pattern: thread idx copies d_in[idx] to d_out[idx].
//
// Expects a 1D launch. The grid may contain more threads than n_size; the
// surplus threads do nothing.
__global__
void d_map(float* d_out, float* d_in, int n_size) {
    int idx = blockDim.x * blockIdx.x + threadIdx.x;
    
    // The grid size is rounded up to whole blocks, so guard the tail.
    if (idx < n_size) {
        d_out[idx] = d_in[idx];
    }
}

// Map pattern with a mirrored destination: thread idx writes d_in[idx] to
// d_out[n_size - 1 - idx], so d_out ends up as d_in in reverse order.
//
// Expects a 1D launch. Threads with idx >= n_size do nothing.
__global__
void d_map_reverse(float* d_out, float* d_in, int n_size) {
    int idx = blockDim.x * blockIdx.x + threadIdx.x;
    
    if (idx < n_size) {
        // The last valid index is n_size - 1; "n_size - idx" would send
        // element 0 one slot past the end and leave slot 0 unwritten.
        d_out[n_size - 1 - idx] = d_in[idx];
    }
}

// CPU reference for d_map_reverse: p_out[n_size - 1 - i] = p_in[i] for every
// i in [0, n_size). p_out and p_in must each hold at least n_size floats.
void reverse(float* p_out, float* p_in, int n_size) {
    for (int i = 0; i < n_size; i++) {
        p_out[n_size - 1 - i] = p_in[i];
    }
}

// Runs both map kernels on a 65536-element ramp and prints, for each one,
// the number of elements that differ from the expected host-side result.
int main() {
    const int n_size = 65536;
    const size_t n_bytes = n_size * sizeof(float);
    
    // Host side: input, device read-back, and CPU-reversed reference.
    float* p_in = get_buffer(n_size);
    float* p_out = get_buffer(n_size);
    float* p_reverse = get_buffer(n_size);
    
    // Device side: input and output of the kernels.
    float* d_in;
    float* d_out;
    cudaMalloc((void**)&d_in, n_bytes);
    cudaMalloc((void**)&d_out, n_bytes);
    cudaMemcpy(d_in, p_in, n_bytes, cudaMemcpyHostToDevice);
    
    // 256 threads per block, enough blocks to cover every element.
    dim3 blockDim(256);
    dim3 gridDim((n_size + blockDim.x - 1) / blockDim.x);
    
    // Pass 1: plain copy, compared against the input itself.
    d_map<<<gridDim, blockDim>>>(d_out, d_in, n_size);
    cudaMemcpy(p_out, d_out, n_bytes, cudaMemcpyDeviceToHost);
    check_result(p_in, p_out, n_size);
    
    // Pass 2: reversed copy, compared against the CPU reverse().
    d_map_reverse<<<gridDim, blockDim>>>(d_out, d_in, n_size);
    cudaMemcpy(p_out, d_out, n_bytes, cudaMemcpyDeviceToHost);
    reverse(p_reverse, p_in, n_size);
    check_result(p_reverse, p_out, n_size);
    
    // Release device memory, then host memory.
    cudaFree(d_in);
    cudaFree(d_out);
    
    free(p_in);
    free(p_out);
    free(p_reverse);
}

In [None]:
! make map && ./map

## 2. Gather

In [None]:
%%file gather.cu
#include <stdio.h>
#include "util.c"

// Dimensions of the 2D grid that main() allocates and processes. The functions
// below take parameters with the same names, which shadow these globals
// inside their bodies.
const int n_width = 1024;
const int n_height = 1024;

// CPU reference for the gather pattern.
//
// For every cell (row, col) of a row-major n_height x n_width grid, sums the
// input cells in the n_filter_size x n_filter_size window whose top-left
// corner is (row, col). Window cells that fall outside the grid contribute
// nothing. The sum is stored in p_out at the same position.
void gather_sum(float* p_out, float* p_in, int n_filter_size, int n_width, int n_height) {
    for (int row = 0; row < n_height; row++) {
        // Clip the window at the bottom edge of the grid.
        int row_end = row + n_filter_size;
        if (row_end > n_height) {
            row_end = n_height;
        }
        
        for (int col = 0; col < n_width; col++) {
            // Clip the window at the right edge of the grid.
            int col_end = col + n_filter_size;
            if (col_end > n_width) {
                col_end = n_width;
            }
            
            float sum = 0.f;
            for (int r = row; r < row_end; r++) {
                for (int c = col; c < col_end; c++) {
                    sum += p_in[n_width * r + c];
                }
            }
            p_out[row * n_width + col] = sum;
        }
    }
}

// Gather pattern on the GPU: one thread per output cell.
//
// Thread (x, y) of a 2D launch handles column x and row y of a row-major
// n_height x n_width grid. It sums the input cells in the
// n_filter_size x n_filter_size window whose top-left corner is its own cell,
// skipping window cells outside the grid, and stores the sum in p_out at its
// own position. This is the same window and summation order as the CPU
// gather_sum() in this file. Threads outside the grid do nothing.
__global__
void d_gather_sum(float* p_out, float* p_in, int n_filter_size, int n_width, int n_height) {
    int col = blockDim.x * blockIdx.x + threadIdx.x;
    int row = blockDim.y * blockIdx.y + threadIdx.y;
    
    // The grid is rounded up to whole blocks, so guard both edges.
    if (col >= n_width || row >= n_height) {
        return;
    }
    
    float sum = 0.f;
    for (int row_filter = 0; row_filter < n_filter_size; row_filter++) {
        for (int col_filter = 0; col_filter < n_filter_size; col_filter++) {
            int in_row = row + row_filter;
            int in_col = col + col_filter;
            
            if (in_row < n_height && in_col < n_width) {
                sum += p_in[n_width * in_row + in_col];
            }
        }
    }
    p_out[row * n_width + col] = sum;
}

// Computes the gather sum on the CPU and on the GPU for an
// n_width x n_height ramp, then prints the number of differing elements
// followed by the total element count.
int main() {
    const int n_filter_size = 3;
    const int n_size = n_width * n_height;
    const size_t n_bytes = n_size * sizeof(float);
    
    // Host side: input, GPU read-back, CPU reference.
    float* p_in = get_buffer(n_size);
    float* p_out = get_buffer(n_size);
    float* p_out_host = get_buffer(n_size);
    
    // Device side: input and output of the kernel.
    float* d_in;
    float* d_out;
    cudaMalloc((void**)&d_in, n_bytes);
    cudaMalloc((void**)&d_out, n_bytes);
    cudaMemcpy(d_in, p_in, n_bytes, cudaMemcpyHostToDevice);
    
    // CPU reference result.
    gather_sum(p_out_host, p_in, n_filter_size, n_width, n_height);
    
    // GPU result: 16x16 threads per block, enough blocks to cover the grid.
    dim3 blockDim(16, 16);
    dim3 gridDim((n_width + blockDim.x - 1) / blockDim.x,
                 (n_height + blockDim.y - 1) / blockDim.y);
    d_gather_sum<<<gridDim, blockDim>>>(d_out, d_in, n_filter_size, n_width, n_height);
    cudaMemcpy(p_out, d_out, n_bytes, cudaMemcpyDeviceToHost);
    
    check_result(p_out, p_out_host, n_size);
    printf("%d\n", n_size);
    
    // Release device memory, then host memory.
    cudaFree(d_in);
    cudaFree(d_out);
    
    free(p_in);
    free(p_out);
    free(p_out_host);
}

In [None]:
! make gather && ./gather

## 3. Stencil

In [None]:
%%file stencil.cu
#include <stdio.h>
#include "util.c"

// Dimensions of the 2D grid that main() allocates and processes. The functions
// below take parameters with the same names, which shadow these globals
// inside their bodies.
const int n_width = 1024;
const int n_height = 1024;

// CPU reference for the stencil pattern.
//
// p_filter holds n_filter_size offset pairs laid out as
// {col_offset, row_offset, col_offset, row_offset, ...}. For every cell
// (row, col) of a row-major n_height x n_width grid, sums the input cells at
// (row + row_offset, col + col_offset) over all pairs, skipping neighbours
// outside the grid, and stores the sum in p_out at the same position.
void stencil_sum(float* p_out, float* p_in, int* p_filter, int n_filter_size, int n_width, int n_height) {
    for (int row = 0; row < n_height; row++) {
        for (int col = 0; col < n_width; col++) {
            float sum = 0.f;
            for (int i = 0; i < n_filter_size; i++) {
                // Use the offsets from the filter; they must not be
                // overridden, or every tap reads the same neighbour.
                int in_col = col + p_filter[i*2 + 0];
                int in_row = row + p_filter[i*2 + 1];
                    
                if ((in_row >= 0 && in_row < n_height) && 
                    (in_col >= 0 && in_col < n_width)) {
                    sum += p_in[n_width * in_row + in_col];
                }
            }
            p_out[row * n_width + col] = sum;
        }
    }
}

// Stencil pattern on the GPU: one thread per output cell.
//
// p_filter holds n_filter_size offset pairs laid out as
// {col_offset, row_offset, ...}. Thread (x, y) of a 2D launch handles column
// x and row y of a row-major n_height x n_width grid: it sums the input
// cells at (row + row_offset, col + col_offset) over all pairs, skipping
// neighbours outside the grid, and stores the sum in p_out at its own
// position. Threads outside the grid do nothing.
__global__
void d_stencil_sum(float* p_out, float* p_in, int* p_filter, int n_filter_size, int n_width, int n_height) {
    int col = blockDim.x * blockIdx.x + threadIdx.x;
    int row = blockDim.y * blockIdx.y + threadIdx.y;
    
    // The grid is rounded up to whole blocks, so guard both edges.
    if (col >= n_width || row >= n_height) {
        return;
    }
    
    float sum = 0.f;
    for (int i = 0; i < n_filter_size; i++) {
        int in_col = col + p_filter[i*2 + 0];
        int in_row = row + p_filter[i*2 + 1];
        
        if ((in_row >= 0 && in_row < n_height) &&
            (in_col >= 0 && in_col < n_width)) {
            sum += p_in[n_width * in_row + in_col];
        }
    }
    p_out[row * n_width + col] = sum;
}

// Computes a 5-point stencil sum on the CPU and on the GPU for an
// n_width x n_height ramp and prints the number of differing elements.
int main() {
    float *p_in, *p_out, *p_out_host;
    float *d_in, *d_out;
    int *p_filter, *d_filter;
    int n_filter_size = 5;
    int n_size = n_width * n_height;
    // {col_offset, row_offset} pairs: up, left, centre, right, down.
    int stencil_filter[5][2] = {{0, -1}, {-1, 0}, {0, 0}, {1, 0}, {0, 1}};
    size_t n_filter_bytes = n_filter_size * 2 * sizeof(int);
    
    p_in = get_buffer(n_size);
    p_out = get_buffer(n_size);
    p_out_host = get_buffer(n_size);
    // The filter holds ints, so allocate it as ints instead of borrowing the
    // float-typed get_buffer().
    p_filter = (int*)malloc(n_filter_bytes);
    
    // Build stencil filter
    memcpy(p_filter, stencil_filter, n_filter_bytes);
        
    cudaMalloc((void**)&d_in, n_size * sizeof(float));
    cudaMalloc((void**)&d_out, n_size * sizeof(float));
    cudaMalloc((void**)&d_filter, n_filter_bytes);
  
    cudaMemcpy(d_in, p_in, n_size * sizeof(float), cudaMemcpyHostToDevice);
    // Sized with sizeof(int) so it matches the d_filter allocation.
    cudaMemcpy(d_filter, p_filter, n_filter_bytes, cudaMemcpyHostToDevice);
    
    stencil_sum(p_out_host, p_in, p_filter, n_filter_size, n_width, n_height);
    
    // 16x16 threads per block, enough blocks to cover the whole grid.
    dim3 blockDim(16, 16);
    dim3 gridDim((n_width + blockDim.x - 1) / blockDim.x, (n_height + blockDim.y - 1) / blockDim.y);
    d_stencil_sum<<<gridDim, blockDim>>>(d_out, d_in, d_filter, n_filter_size, n_width, n_height);
    
    cudaMemcpy(p_out, d_out, n_size * sizeof(float), cudaMemcpyDeviceToHost);
    
    check_result(p_out_host, p_out, n_size);
        
    cudaFree(d_in);
    cudaFree(d_out);
    cudaFree(d_filter);
    
    free(p_in);
    free(p_out);
    free(p_out_host);
    free(p_filter);
    
    return 0;
}

In [None]:
! make stencil && ./stencil

## 4. Scatter

In [None]:
%%file scatter.cu
#include <stdio.h>
#include "util.c"

// Dimensions of the 2D output grid that main() allocates and processes. The
// functions below take parameters with the same names, which shadow these
// globals inside their bodies.
const int n_width = 1024;
const int n_height = 1024;

// CPU reference for the scatter pattern.
//
// p_in holds n_width values; p_out is a row-major n_height x n_width grid.
// Input element j is written to column j of every row:
// p_out[i * n_width + j] = p_in[j].
void scatter(float* p_out, float* p_in, int n_width, int n_height) {
    // Rows outermost so p_out is written sequentially.
    for (int i = 0; i < n_height; i++) {
        for (int j = 0; j < n_width; j++) {
            // Scatter the input value itself, not the column index, so the
            // result stays correct for any input contents.
            p_out[i * n_width + j] = p_in[j];
        }
    }
}

// Scatter pattern on the GPU, one thread per input element.
//
// d_in holds n_width values; d_out is a row-major n_height x n_width grid.
// The thread handling input element j writes d_in[j] to column j of every
// row. The column loop advances by the total number of launched threads, so
// the kernel covers all n_width columns for any 1D grid size.
__global__
void d_scatter_1D(float* d_out, float* d_in, int n_width, int n_height) {
    int stride = gridDim.x * blockDim.x;
    
    for (int j = blockDim.x * blockIdx.x + threadIdx.x; j < n_width; j += stride) {
        float value = d_in[j];
        for (int i = 0; i < n_height; i++) {
            d_out[i * n_width + j] = value;
        }
    }
}

// Launches d_scatter_1D with 256-thread blocks and enough blocks to give
// every input element its own thread. d_out and d_in are device pointers.
void CallScatter1D(float* d_out, float* d_in, int n_width, int n_height) {
    // Nothing to scatter, and a zero-sized grid is not a valid launch.
    if (n_width <= 0 || n_height <= 0) {
        return;
    }
    
    dim3 blockDim(256);
    dim3 gridDim((n_width + blockDim.x - 1) / blockDim.x);
    d_scatter_1D<<<gridDim, blockDim>>>(d_out, d_in, n_width, n_height);
}

// Scatter pattern on the GPU, one thread per output cell.
//
// d_in holds n_width values; d_out is a row-major n_height x n_width grid.
// Thread (x, y) of a 2D launch writes d_in[x] to row y, column x of d_out.
// Threads outside the grid do nothing.
__global__
void d_scatter_2D(float* d_out, float* d_in, int n_width, int n_height) {
    int j = blockDim.x * blockIdx.x + threadIdx.x;
    int i = blockDim.y * blockIdx.y + threadIdx.y;
    
    // The grid is rounded up to whole blocks, so guard both edges.
    if (j < n_width && i < n_height) {
        d_out[i * n_width + j] = d_in[j];
    }
}

// Launches d_scatter_2D with 16x16-thread blocks and enough blocks to cover
// the whole n_height x n_width output. d_out and d_in are device pointers.
void CallScatter2D(float* d_out, float* d_in, int n_width, int n_height) {
    // Nothing to scatter, and a zero-sized grid is not a valid launch.
    if (n_width <= 0 || n_height <= 0) {
        return;
    }
    
    dim3 blockDim(16, 16);
    dim3 gridDim((n_width + blockDim.x - 1) / blockDim.x, (n_height + blockDim.y - 1) / blockDim.y);
    d_scatter_2D<<<gridDim, blockDim>>>(d_out, d_in, n_width, n_height);
}

// Scatters an n_width-element ramp into an n_height x n_width grid on the CPU
// and with both GPU launchers, printing the number of differing elements
// after each GPU run.
int main() {
    float *p_in, *p_out, *p_out_host;
    float *d_in, *d_out;
    size_t n_out_bytes = n_width * n_height * sizeof(float);
    
    p_in = get_buffer(n_width);
    p_out = get_buffer(n_width * n_height);
    p_out_host = get_buffer(n_width * n_height);
    
    cudaMalloc((void**)&d_in, n_width * sizeof(float));
    cudaMalloc((void**)&d_out, n_out_bytes);
    
    cudaMemcpy(d_in, p_in, n_width * sizeof(float), cudaMemcpyHostToDevice);
    
    scatter(p_out_host, p_in, n_width, n_height);
    
    CallScatter1D(d_out, d_in, n_width, n_height);
    cudaMemcpy(p_out, d_out, n_out_bytes, cudaMemcpyDeviceToHost);
    check_result(p_out_host, p_out, n_width * n_height);
    
    // Clear the output so the second check validates the 2D kernel on its
    // own, not the values left behind by the 1D kernel.
    cudaMemset(d_out, 0, n_out_bytes);
    
    CallScatter2D(d_out, d_in, n_width, n_height);
    cudaMemcpy(p_out, d_out, n_out_bytes, cudaMemcpyDeviceToHost);
    check_result(p_out_host, p_out, n_width * n_height);
    
    cudaFree(d_in);
    cudaFree(d_out);
    
    free(p_in);
    free(p_out);
    free(p_out_host);
    
    return 0;
}

In [None]:
! make scatter && ./scatter

## 5. Transpose

In [None]:
%%file transpose.cu
#include <stdio.h>
#include "util.c"

// CPU reference for the transpose pattern.
//
// p_in is a row-major matrix with n_height rows and n_width columns; p_out
// receives its transpose, a row-major matrix with n_width rows and n_height
// columns: p_out[i * n_height + j] = p_in[j * n_width + i].
void transpose(float *p_out, float *p_in, int n_width, int n_height) {
    for (int src_row = 0; src_row < n_height; ++src_row) {
        // Index of the first element of this input row.
        int row_base = src_row * n_width;
        
        int src_col = 0;
        while (src_col < n_width) {
            // Input (row, col) lands at output (col, row).
            p_out[src_col * n_height + src_row] = p_in[row_base + src_col];
            ++src_col;
        }
    }
}

// Transpose pattern on the GPU: one thread per input element.
//
// d_in is a row-major matrix with n_height rows and n_width columns; d_out
// receives its transpose (n_width rows, n_height columns). Thread (x, y) of
// a 2D launch reads input row y, column x and writes it to output row x,
// column y. Threads outside the matrix do nothing.
//
// This is the straightforward version with no shared-memory tiling.
__global__ 
void d_transpose(float *d_out, float *d_in, int n_width, int n_height) {
    int i = blockDim.x * blockIdx.x + threadIdx.x;  // input column
    int j = blockDim.y * blockIdx.y + threadIdx.y;  // input row
    
    // The grid is rounded up to whole blocks, so guard both edges.
    if (i < n_width && j < n_height) {
        d_out[i * n_height + j] = d_in[j * n_width + i];
    }
}

// Transposes a 1920 x 1080 ramp on the CPU and on the GPU and prints the
// number of elements on which the two results differ.
int main() {
    const int n_width = 1920;
    const int n_height = 1080;
    const int n_elems = n_width * n_height;
    const size_t n_bytes = n_elems * sizeof(float);
    
    // Host side: input, CPU result, GPU read-back.
    float* p_in = get_buffer(n_elems);
    float* p_out = get_buffer(n_elems);
    float* p_out_cuda = get_buffer(n_elems);
    
    // Step 1. Allocate GPU memory
    float* d_in;
    float* d_out;
    cudaMalloc((void**)&d_in, n_bytes);
    cudaMalloc((void**)&d_out, n_bytes);
    
    // Initialize input data: element k of the row-major matrix holds k
    for (int k = 0; k < n_elems; k++) {
        p_in[k] = float(k);
    }
    
    // Step 2. Copy the input to GPU memory
    cudaMemcpy(d_in, p_in, n_bytes, cudaMemcpyHostToDevice);
    
    // CPU reference result
    transpose(p_out, p_in, n_width, n_height);
    
    // Step 3. Kernel launch: 16x16 threads per block covering the matrix
    dim3 blockDim(16, 16);
    dim3 gridDim((n_width + blockDim.x - 1) / blockDim.x,
                 (n_height + blockDim.y - 1) / blockDim.y);
    d_transpose<<<gridDim, blockDim>>>(d_out, d_in, n_width, n_height);
    
    // Step 4. Copy the result back from the GPU
    cudaMemcpy(p_out_cuda, d_out, n_bytes, cudaMemcpyDeviceToHost);
    
    // Step 5. Compare CPU and GPU results
    check_result(p_out, p_out_cuda, n_elems);
    
    // Step 6. Free GPU memory, then host memory
    cudaFree(d_in);
    cudaFree(d_out);
    
    free(p_in);
    free(p_out);
    free(p_out_cuda);
}

In [None]:
! make transpose && ./transpose