# Kernel Tuner Tutorial - Energy aware computing

## Hands-on

In [None]:
# Download the bzip2-compressed cache file. The URL is quoted so that the shell
# does not treat the `?` in `?raw=true` as a glob character.
!wget -O GEMM_A100_cache.json.bz2 "https://github.com/KernelTuner/kernel_tuner_tutorial/blob/master/energy/data/GEMM_NVML_NVIDIA_A100-PCIE-40GB_freq_cache.json.bz2?raw=true"
# -f overwrites an already-unpacked GEMM_A100_cache.json, so this cell can be re-run.
!bunzip2 -f GEMM_A100_cache.json.bz2

In [None]:
#%pip install kernel_tuner
%pip install git+https://github.com/KernelTuner/kernel_tuner.git@energy_tutorial

import numpy as np
import kernel_tuner as kt
import collections

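We now define the tunable parameters, the restrictions on the search space, and the metrics, and then tune the kernel. Because the tuning call below runs in simulation mode on the downloaded cache file and is passed an empty argument list, no memory needs to be allocated in this part.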

In [4]:
# Tunable parameters and their candidate values. The OrderedDict keeps the
# parameters in exactly the order in which they are listed here.
tune_params = collections.OrderedDict(
    [
        # <=== Parameter for clock frequency of GPU
        ("nvml_gr_clock", [330, 510, 690, 870, 1050, 1230, 1410]),
        ("MWG", [16, 32, 64, 128]),
        ("NWG", [16, 32, 64, 128]),
        ("KWG", [32]),
        ("MDIMC", [8, 16, 32]),
        ("NDIMC", [8, 16, 32]),
        ("MDIMA", [8, 16, 32]),
        ("NDIMB", [8, 16, 32]),
        ("KWI", [2]),
        ("VWM", [1, 2, 4, 8]),
        ("VWN", [1, 2, 4, 8]),
        ("STRM", [0]),
        ("STRN", [0]),
        ("SA", [0, 1]),
        ("SB", [0, 1]),
        ("PRECISION", [32]),
    ]
)

# Size of the matrices to test on
m, n, k = 4096, 4096, 4096
problem_size = (m, n)

# Names of the tunable parameters that act as grid divisors and block dimensions.
grid_div_x = ["MWG"]
grid_div_y = ["NWG"]
block_size_names = ["MDIMC", "NDIMC", "block_size_z"]

# Search space restriction: each entry is an expression over the tunable
# parameter names; the list is passed to tune_kernel as `restrictions`.
restrict = [
    "KWG % KWI == 0",
    "MWG % (MDIMC * VWM) == 0",
    "NWG % (NDIMC * VWN) == 0",
    "MWG % (MDIMA * VWM) == 0",
    "NWG % (NDIMB * VWN) == 0",
    "KWG % ((MDIMC * NDIMC)/MDIMA) == 0",
    "KWG % ((MDIMC * NDIMC)/NDIMB) == 0",
    "not (MWG == 128 and NWG == 128 and MDIMC == 8 and NDIMC == 8)",
]

def ops(m, n, k):
    """Return the operation count for sizes ``m``, ``n``, ``k`` in units of 1e9.

    The count is the sum of ``m * n * k * 2`` and ``2 * m * k``; the sum is
    divided by 1e9 so the result can be used directly in GFLOP-based metrics.
    """
    product_term = m * n * k * 2
    extra_term = 2 * m * k
    return (product_term + extra_term) / 1e9

# Operation count for the chosen problem size; ops() has already divided it by 1e9.
total_flops = ops(m, n, k)

# User-defined metrics, each computed from the result dictionary `p` of one configuration.
metrics = collections.OrderedDict(
    [
        # p["time"] is divided by 1000.0 first (presumably milliseconds -> seconds; confirm).
        ("GFLOP/s", lambda p: total_flops / (p["time"] / 1000.0)),
        # Operation count divided by the value stored under "nvml_energy".
        ("GFLOPS/W", lambda p: total_flops / p["nvml_energy"]),
    ]
)

# Create NVML observer
#nvmlobserver = kt.nvml.NVMLObserver(["temperature"])

In [8]:
# Search strategy and the maximum number of function evaluations handed to it.
strategy = "greedy_mls"
fevals = 100

# Metric used as the tuning objective.
# For speed:
#to_optimize = 'GFLOP/s'

# For energy:
to_optimize = "GFLOPS/W"

# Run the tuner in simulation mode on the cache file unpacked in the first cell.
# The kernel source string and the argument list are left empty.
results, env = kt.tune_kernel(
    "Xgemm",
    "",
    problem_size,
    [],
    tune_params,
    block_size_names=block_size_names,
    grid_div_x=grid_div_x,
    grid_div_y=grid_div_y,
    restrictions=restrict,
    metrics=metrics,
    objective=to_optimize,
    strategy=strategy,
    strategy_options={"max_fevals": fevals},
    simulation_mode=True,
    cache="GEMM_A100_cache.json",
)

Simulating: NVIDIA A100-PCIE-40GB
nvml_gr_clock=510, MWG=32, NWG=32, KWG=32, MDIMC=8, NDIMC=16, MDIMA=8, NDIMB=8, KWI=2, VWM=4, VWN=2, STRM=0, STRN=0, SA=1, SB=0, PRECISION=32, time=33.917, GFLOP/s=4053.236, GFLOPS/W=34.333
nvml_gr_clock=870, MWG=32, NWG=32, KWG=32, MDIMC=8, NDIMC=16, MDIMA=8, NDIMB=8, KWI=2, VWM=4, VWN=2, STRM=0, STRN=0, SA=1, SB=0, PRECISION=32, time=20.125, GFLOP/s=6831.087, GFLOPS/W=39.314
nvml_gr_clock=870, MWG=32, NWG=32, KWG=32, MDIMC=8, NDIMC=16, MDIMA=8, NDIMB=8, KWI=2, VWM=4, VWN=1, STRM=0, STRN=0, SA=1, SB=0, PRECISION=32, time=20.952, GFLOP/s=6561.276, GFLOPS/W=40.476
nvml_gr_clock=870, MWG=64, NWG=32, KWG=32, MDIMC=8, NDIMC=16, MDIMA=8, NDIMB=8, KWI=2, VWM=4, VWN=1, STRM=0, STRN=0, SA=1, SB=0, PRECISION=32, time=29.737, GFLOP/s=4623.005, GFLOPS/W=41.039
nvml_gr_clock=870, MWG=64, NWG=32, KWG=32, MDIMC=16, NDIMC=16, MDIMA=8, NDIMB=8, KWI=2, VWM=4, VWN=1, STRM=0, STRN=0, SA=1, SB=0, PRECISION=32, time=22.157, GFLOP/s=6204.494, GFLOPS/W=42.114
nvml_gr_clock=8

KeyError: ignored

In [None]:
# matrix width needs to match the value in the kernel source
matrix_width = 512
problem_size = (matrix_width, matrix_width)

# Use a seeded generator so that the random input matrices, and therefore the
# reference answer computed below, are identical every time this cell is run.
rng = np.random.default_rng(42)
A = rng.standard_normal(problem_size).astype(np.float32)
B = rng.standard_normal(problem_size).astype(np.float32)
C = np.zeros_like(A)

# Kernel arguments in the order C, A, B.
args = [C, A, B]

# Candidate values: powers of two from 1 to 1024 for the block sizes and from
# 1 to 32 for the tile sizes.
tune_params = collections.OrderedDict()
tune_params["block_size_x"] = [2**i for i in range(0, 11)]
tune_params["block_size_y"] = [2**i for i in range(0, 11)]
tune_params["tile_size_x"] = [2**i for i in range(0, 6)]
tune_params["tile_size_y"] = [2**i for i in range(0, 6)]

restrict = ["block_size_x == block_size_y * tile_size_y"]

grid_div_x = ["block_size_x", "tile_size_x"]
grid_div_y = ["block_size_y", "tile_size_y"]

# Reference result for the first argument (C). The None entries presumably mean
# that A and B are not checked -- see the Kernel Tuner API documentation.
answer = [np.matmul(A,B), None, None]

# Operation count 2 * matrix_width**3 scaled by 1e9, derived from the same width
# as problem_size so that the two cannot drift apart.
matmul_gflop = 2 * matrix_width**3 / 1e9

metrics = collections.OrderedDict()
metrics["GFLOP/s"] = lambda p : matmul_gflop / (p["time"] / 1e3)

In [None]:
# Tune matmul_kernel from matmul.cu. The file name passed as `cache=` is the
# same one that the KTdashboard cell further down opens.
results, env = kt.tune_kernel(
    "matmul_kernel",
    "matmul.cu",
    problem_size,
    args,
    tune_params,
    grid_div_x=grid_div_x,
    grid_div_y=grid_div_y,
    restrictions=restrict,
    answer=answer,
    atol=1e-4,
    metrics=metrics,
    iterations=32,
    verbose=True,
    lang="cupy",
    cache="matmul_cache.json",
)
print(f"Number of configurations: {len(results)}")

We can also visualize the tuning results using [KTdashboard](https://github.com/KernelTuner/dashboard).

In [None]:
%pip install git+https://github.com/KernelTuner/dashboard
import panel as pn
# Initialise Panel before the dashboard module is imported; comms='colab'
# presumably selects the Google Colab communication backend -- TODO confirm
# against the Panel documentation when running in a different environment.
pn.extension(comms='colab')
import ktdashboard.ktdashboard as ktd

In [None]:
# Build the dashboard from matmul_cache.json (the file name passed as `cache=`
# to tune_kernel above) and show it as this cell's output.
dashboard = ktd.KTdashboard("matmul_cache.json")
dashboard.notebook()

There are times when the number of possible configurations of tunable parameters is too large, or other time constraints do not allow a full search to be performed. In those cases, it can be beneficial to use one of Kernel Tuner's **search optimization strategies**.

You can experiment with them in the next block. Try different strategies, and compare the optimum found with the overall optimum found previously. You can also time the tuning process to see the differences there.

The strategies and how to enable them are described in Kernel Tuner's [API](https://KernelTuner.github.io/kernel_tuner/stable/user-api.html).

In [None]:
# experiment with enabling a search optimization strategy
# (replace the empty placeholder with a strategy name; the simulation cell
# earlier in this notebook uses 'greedy_mls', for example)
strategy = ""

# tell the strategy to compile and benchmark at most 40 kernel configurations
strategy_options = {"max_fevals": 40}

# Same tuning call as before, now with the strategy settings; the results go
# into separate variables so they can be compared with the earlier run.
results_opt, env_opt = kt.tune_kernel(
    "matmul_kernel",
    "matmul.cu",
    problem_size,
    args,
    tune_params,
    grid_div_x=grid_div_x,
    grid_div_y=grid_div_y,
    restrictions=restrict,
    answer=answer,
    atol=1e-4,
    metrics=metrics,
    iterations=32,
    verbose=True,
    lang="cupy",
    strategy=strategy,
    strategy_options=strategy_options,
)
print(f"Number of configurations: {len(results_opt)}")

Next we are going to add a **custom observer** to the kernel. One possibility is to add an observer to compute the number of registers used by the kernel, and add this value to the metrics.

In order to create a new observer, it is necessary to extend the class `BenchmarkObserver`, which Kernel Tuner provides in the `kernel_tuner.observers` module. If you want to access the number of registers used by a kernel instance, it is available inside your observer class as `self.dev.func.num_regs`.

As usual, how to add observers is described in Kernel Tuner's [API](https://KernelTuner.github.io/kernel_tuner/stable/user-api.html).

In [None]:
# Hands-on exercise: extend the tuning run with observers and additional metrics.
# Every observer appended to this list is passed to tune_kernel through its
# `observers` argument at the end of the cell.
observers = []

# add a custom observer
from kernel_tuner.observers import BenchmarkObserver

# define your own observer class that extends BenchmarkObserver
#class CustomObserver(BenchmarkObserver)

# implement the get_results method of this class
# ...

# create an instance of your custom observer
# (placeholder: replace None with an instance of your observer class)
custom_observer = None

# append it to the list of observers by uncommenting the line below
#observers.append(custom_observer)

# add a metric so that our observed number of registers appears in the console output
#metrics["regs"] = lambda p:p["num_regs"]


# add an NVMLObserver
from kernel_tuner.nvml import NVMLObserver
# request the "nvml_energy" and "temperature" observables; the two metrics
# defined below read exactly these keys from each result
nvml_observer = NVMLObserver(["nvml_energy", "temperature"])

observers.append(nvml_observer)

# add metrics to enable console output for observed quantities
# NOTE: this updates, in place, the `metrics` dictionary created in an earlier cell
# (2 * 512**3 / 1e9) has the same value as the numerator of the "GFLOP/s" metric
metrics["GFLOPS/W"] = lambda p : (2 * 512**3 / 1e9) / (p["nvml_energy"])
metrics["T"] = lambda p:p["temperature"]

# call tune_kernel to tune using our new Observers and additional metrics
results, env = kt.tune_kernel("matmul_kernel", "matmul.cu",
                             problem_size, args, tune_params,
                             observers=observers,
                             grid_div_y=grid_div_y, grid_div_x=grid_div_x,
                             answer=answer, atol=1e-4,
                             restrictions=restrict, verbose=True, iterations=32, metrics=metrics, lang="cupy")
print(f"Number of configurations: {len(results)}")