# Kernel Tuner Tutorial

## Convolution

In this hands-on session we are going to explore all the features of Kernel Tuner.

We start by providing the source code of a convolution algorithm; your goal is to tune this kernel, without modifying the CUDA code.

This particular convolution applies a random `17x17` filter to a `512x512` image, and it has multiple tunable parameters:

* `block_size_x`: the number of threads per block in the `x` dimension
* `block_size_y`: the number of threads per block in the `y` dimension
* `tile_size_x`: the size of the tile in the `x` dimension
* `tile_size_y`: the size of the tile in the `y` dimension
* `use_padding`: a binary flag to disable or enable padding in shared memory
* `read_only`: a binary flag to disable or enable the use of read-only cache

After each section in the tutorial, and after you are done with the corresponding hands-on, you can come back to this notebook and improve the tuning process.

In [None]:
%%writefile convolution.cu

/*
 * Compile-time configuration of the convolution kernel.
 *
 * The image and filter dimensions are fixed here. Everything guarded by
 * #ifndef can be overridden from the outside (e.g. by -D flags or by
 * definitions prepended to this file) and otherwise falls back to the
 * default given below.
 */
#define image_height 512
#define image_width 512

#define filter_height 17
#define filter_width 17

/*
 * The input is larger than the output image: it carries a border of
 * (filter size - 1) elements in each dimension (for odd filter sizes),
 * so every output pixel has a complete filter window.
 */
#define border_height ((filter_height/2)*2)
#define border_width ((filter_width/2)*2)
#define input_height (image_height + border_height)
#define input_width (image_width + border_width)

#ifndef block_size_x
    #define block_size_x 16
#endif
#ifndef block_size_y
    #define block_size_y 16
#endif
#ifndef block_size_z
    #define block_size_z 1
#endif
#ifndef tile_size_x
    #define tile_size_x 1
#endif
#ifndef tile_size_y
    #define tile_size_y 1
#endif

/*
 * Exclusive upper bounds of the rows (i_end) and columns (j_end) of input
 * that one thread block loads: its output tile plus the border, clamped to
 * the size of the input.
 */
#define i_end min(block_size_y*tile_size_y+border_height, input_height)
#define j_end min(block_size_x*tile_size_x+border_width, input_width)

/*
 * If requested, we can use the __ldg directive to load data through the
 * read-only cache.
 *
 * If not passed as a tunable parameter, the read-only cache is off by
 * default. Making the default explicit (instead of relying on undefined
 * identifiers evaluating to 0 inside #if) mirrors how use_padding is handled.
 */
#ifndef read_only
    #define read_only 0
#endif
#define USE_READ_ONLY_CACHE read_only
#if USE_READ_ONLY_CACHE == 1
#define LDG(x, y) __ldg(x+y)
#elif USE_READ_ONLY_CACHE == 0
#define LDG(x, y) x[y]
#else
// fail early with a clear message instead of leaving LDG undefined
#error "read_only must be 0 or 1"
#endif

// Filter weights in constant memory; sized from the filter macros so the
// array cannot get out of sync with the indexing done in the kernel.
__constant__ float d_filter[filter_height*filter_width];

/*
 * If use_padding == 1, we introduce (only when necessary) a number of padding
 * columns in shared memory to avoid shared memory bank conflicts
 *
 * padding columns are only inserted when block_size_x is not a multiple of 32 (the assumed number of memory banks)
 * and when the width of the data needed beyond block_size_x (shared_mem_width - block_size_x) is not a multiple
 * of 32. The latter is because some filter_widths never cause bank conflicts.
 *
 * If not passed as a tunable parameter, padding is on by default
 */
#define shared_mem_width (block_size_x*tile_size_x+border_width)
#ifndef use_padding
    #define use_padding 1
#endif
#if use_padding == 1
    #if (((block_size_x % 32)!=0) && (((shared_mem_width-block_size_x)%32) != 0))
        // next line uses &31 instead of %32, because % in C is remainder not modulo
        #define padding_columns ((32 - (border_width + block_size_x*tile_size_x - block_size_x)) & 31)
        #undef shared_mem_width
        #define shared_mem_width (block_size_x*tile_size_x+border_width+padding_columns)
    #endif
#endif


/*
 * 2D convolution of the input with the filter weights stored in d_filter.
 *
 * Each output pixel is computed as
 *   output[y][x] = sum over (i, j) of input[y+i][x+j] * d_filter[i*filter_width+j]
 * with 0 <= i < filter_height and 0 <= j < filter_width.
 *
 * Work distribution: one thread block produces a tile of
 * (block_size_y*tile_size_y) x (block_size_x*tile_size_x) output pixels, and
 * each thread produces tile_size_y x tile_size_x of them, spaced
 * block_size_y rows / block_size_x columns apart.
 *
 * Parameters:
 *   output - image_height x image_width floats, row-major; written by the kernel
 *   input  - input_height x input_width floats, row-major (image plus border); read only
 *   filter - never referenced in the body; the weights are read from d_filter
 *
 * NOTE(review): the kernel never reads blockDim, so it presumably has to be
 * launched with blockDim = (block_size_x, block_size_y) - confirm at the call site.
 */
__global__ void convolution_kernel(float *output, float *input, float *filter) {
    // position of this thread within its block
    int ty = threadIdx.y;
    int tx = threadIdx.x;
    // top-left corner of the output tile computed by this block
    int by = blockIdx.y * block_size_y * tile_size_y;
    int bx = blockIdx.x * block_size_x * tile_size_x;

    // shared memory to hold all input data needed by this thread block
    // (shared_mem_width may include extra padding columns beyond the data actually loaded)
    __shared__ float sh_input[block_size_y*tile_size_y+border_height][shared_mem_width];

    // load all input data needed by this thread block into shared memory
    // the threads of the block step through the region in strides of the block size
    #pragma unroll
    for (int i=ty; i<i_end; i+=block_size_y) {
        #pragma unroll
        for (int j=tx; j<j_end; j+=block_size_x) {
            // the bounds check is only compiled in when the image is not an
            // exact multiple of the output tile computed per thread block
            #if ((image_height%(block_size_y*tile_size_y)!=0) || (image_width%(block_size_x*tile_size_x)!=0))
            int y = by+i;
            int x = bx+j;
            if (y < input_height && x < input_width) {
                sh_input[i][j] = LDG(input, y*input_width+x);
            }
            #else
                sh_input[i][j] = LDG(input, (by+i)*input_width + (bx+j));
            #endif
        }
    }
    // wait until the whole block has finished loading before anyone reads sh_input
    __syncthreads();

    // thread-local registers to hold local sums
    // one accumulator per output pixel computed by this thread
    float sum[tile_size_y][tile_size_x];
    #pragma unroll
    for (int yi=0; yi<tile_size_y; yi++) {
        #pragma unroll
        for (int xi=0; xi<tile_size_x; xi++) {
             sum[yi][xi] = 0.0f;
        }
    }

    // for each filter weight
    #pragma unroll
    for (int i=0; i < filter_height; i++) {
        #pragma unroll
        for (int j=0; j < filter_width; j++) {

            // accumulate the contribution of weight (i, j) to every output pixel of this thread
            #pragma unroll
            for (int yi=0; yi<tile_size_y; yi++) {   
                #pragma unroll
                for (int xi=0; xi<tile_size_x; xi++) {
                    sum[yi][xi] += sh_input[ty+yi*block_size_y+i][tx+xi*block_size_x+j] * d_filter[i*filter_width+j];
                }
            }

        }
    }

    // store results to global memory
    #pragma unroll
    for (int yi=0; yi<tile_size_y; yi++) {   
        #pragma unroll
        for (int xi=0; xi<tile_size_x; xi++) {
            // same compile-time condition as for the loads: only guard the
            // stores when tiles can extend past the edge of the image
            #if ((image_height%(block_size_y*tile_size_y)!=0) || (image_width%(block_size_x*tile_size_x)!=0))
            int y = by+ty+yi*block_size_y;
            int x = bx+tx+xi*block_size_x;
            if (y < image_height && x < image_width) {
                output[y * image_width + x] = sum[yi][xi];
            }
            #else
                output[(by+ty+yi*block_size_y) * image_width + bx+tx+xi*block_size_x] = sum[yi][xi];
            #endif
        }
    }

}

Before using Kernel Tuner, we need to install and import it together with its dependencies.

In [None]:
%pip install numpy
%pip install pycuda
%pip install kernel_tuner

import numpy as np
import kernel_tuner as kt
import collections

It is now up to you to use Kernel Tuner to find the best performing configuration of the convolution kernel.

In [None]:
# Problem geometry: a 512x512 output image convolved with a 17x17 filter.
problem_size = (512, 512)
filter_size = 17
size = np.prod(problem_size)
# Each input dimension is (filter_size - 1) elements larger than the output,
# matching the border that convolution.cu adds around the image.
input_size = int(np.prod([extent + filter_size - 1 for extent in problem_size]))

# Host buffers, all flat float32 arrays. The two random draws are kept in
# this order (input first, then filter) so a seeded run yields the same data.
output_image = np.zeros(size, dtype=np.float32)
input_image = np.random.randn(input_size).astype(np.float32)
filter_weights = np.random.randn(filter_size * filter_size).astype(np.float32)

# Kernel arguments, in the order of the kernel's parameter list
# (output, input, filter).
args = [output_image, input_image, filter_weights]
# The weights are additionally handed over under the name `d_filter`,
# the constant-memory array the kernel reads them from.
cmem_args = dict(d_filter=filter_weights)

# Tuning setup. `tune_params` deliberately starts out empty: this is where
# the tunable parameters and the values to try for each of them go.
compiler_flags = ["-Wno-deprecated-gpu-targets"]
tune_params = collections.OrderedDict()

# Benchmark every configuration described by `tune_params`.
results, env = kt.tune_kernel(
    "convolution_kernel",
    "convolution.cu",
    problem_size,
    args,
    tune_params,
    cmem_args=cmem_args,
    compiler_options=compiler_flags,
    verbose=True,
)