# Kernel Tuner Tutorial - Energy aware computing

## Hands-on

### Getting ready

We start by downloading and installing the dependencies:

In [None]:
# only for non-cached:
#%pip install pycuda
#%pip install pyopencl
#%pip install cupy
#%pip install pynvml

%pip install matplotlib
%pip install seaborn~=0.13.0
%pip install pandas
%pip install git+https://github.com/KernelTuner/kernel_tuner.git@master

!wget -O GEMM_A100_cache.json.bz2 https://github.com/KernelTuner/kernel_tuner_tutorial/blob/master/energy/data/GEMM_NVML_NVIDIA_A100-PCIE-40GB_freq_cache_fake_timings.json.bz2?raw=true
!bunzip2 GEMM_A100_cache.json.bz2

Next, we import the required packages and set our defaults for plotting with Seaborn and Matplotlib:

In [None]:
import kernel_tuner as kt
from copy import deepcopy

# only for non-cached:
# from kernel_tuner.observers.nvml import NVMLObserver

from matplotlib import pyplot as plt
import seaborn as sns

# Default size for every Matplotlib figure created in this notebook.
plt.rcParams['figure.figsize'] = [15, 8]
# Seaborn look: white grid style with the muted palette ...
sns.set_theme(style="whitegrid", palette="muted")
# ... and the "paper" context with explicit font sizes.
sns.set_context("paper", rc={"font.size":10,"axes.titlesize":9,"axes.labelsize":12})
# NOTE(review): sns.set() is documented as an alias of sns.set_theme(); called
# with only font_scale it presumably re-applies the default context, style and
# palette, overriding the two calls above -- confirm which look is intended.
sns.set(font_scale = 1.6)

### Tuning Setup

We start the tuning setup by defining the metrics we will use to measure performance.

In [None]:
def ops(m, n, k):
    """Return the operation count of the matrix multiply, scaled by 1e-9.

    The count is 2*m*n*k plus an additional 2*m*k term.
    """
    product_ops = 2 * m * n * k
    additional_ops = 2 * m * k
    return (product_ops + additional_ops) / 1e9

# Matrix dimensions (all equal) and the problem size passed to the tuner
m = n = k = 4096
problem_size = (512, 512)
total_flops = ops(m, n, k)

# Metrics computed per configuration p, keyed by the name used as objective
metrics = {
    # Throughput: time is divided by 1000 first (presumably ms -> s)
    "GFLOP/s": lambda p: total_flops / (p["time"] / 1000.0),
    # Energy efficiency: operations per unit of measured energy
    "GFLOPS/W": lambda p: total_flops / p["nvml_energy"],
}

Next, we define the parameters we would like to tune, their possible values, and restrictions that apply:

In [None]:
# Tunable parameters and the values each of them may take.
# Single-element lists pin a parameter to one value.
tune_params = {
    # The nvml_gr_clock is the tunable parameter affecting the GPU frequency in MHz,
    # 690 is closest to the baseclock of 765
    "nvml_gr_clock": [330, 510, 690, 870, 1050, 1230, 1410],
    "MWG": [16, 32, 64, 128],
    "NWG": [16, 32, 64, 128],
    "KWG": [32],
    "MDIMC": [8, 16, 32],
    "NDIMC": [8, 16, 32],
    "MDIMA": [8, 16, 32],
    "NDIMB": [8, 16, 32],
    "KWI": [2],
    "VWM": [1, 2, 4, 8],
    "VWN": [1, 2, 4, 8],
    "STRM": [0],
    "STRN": [0],
    "SA": [0, 1],
    "SB": [0, 1],
    "PRECISION": [32],
}

# Names of the tunable parameters used as grid divisors in x and y
grid_div_x = ["MWG"]
grid_div_y = ["NWG"]
# Names of the parameters that take the role of the thread block dimensions
block_size_names = ["MDIMC", "NDIMC", "block_size_z"]

# Search space restrictions: each entry is an expression over the tunable
# parameters that a configuration has to satisfy
restrict = [
    "KWG % KWI == 0",
    "MWG % (MDIMC * VWM) == 0",
    "NWG % (NDIMC * VWN) == 0",
    "MWG % (MDIMA * VWM) == 0",
    "NWG % (NDIMB * VWN) == 0",
    "KWG % ((MDIMC * NDIMC)/MDIMA) == 0",
    "KWG % ((MDIMC * NDIMC)/NDIMB) == 0",
    "not (MWG == 128 and NWG == 128 and MDIMC == 8 and NDIMC == 8)",
]

Next, we set up a Pandas dataframe to keep track of the configurations. 
This is just some boilerplate to easily print and plot configurations later, so feel free to execute the cell without looking into it.

In [None]:
import pandas as pd
from warnings import warn

# Module-level dataframe holding one row per named configuration
df = None


def add_to_dataframe(name: str, config: dict) -> None:
    """Store ``config`` as the row called ``name`` in the global dataframe ``df``.

    Values that are lists or tuples are dropped, both when a new row is added
    and when an existing row is overwritten. If a row with the same name
    already exists, a warning is issued and its values are overwritten in
    place; otherwise a new row is appended (creating ``df`` on first use).

    Args:
        name: label stored in the ``name`` column.
        config: mapping of column name to value for this configuration.
    """
    global df
    # throw out any data that doesn't play nicely in a Pandas dataframe
    new_config = {"name": name}
    new_config.update(
        (key, value)
        for key, value in config.items()
        if not isinstance(value, (list, tuple))
    )
    # if the name already exists, overwrite its values
    if df is not None and df["name"].eq(name).any():
        warn(f"{name} was already in dataframe, overwriting values")
        index = df.index[df.name == name]
        df.loc[index, list(new_config.keys())] = list(new_config.values())
        return
    # encapsulate each *filtered* value as a one-element list so that the
    # dropped list/tuple entries are not re-introduced as columns
    df_add = pd.DataFrame({key: [value] for key, value in new_config.items()})
    # if there is no dataframe yet, this row becomes the dataframe
    if df is None:
        df = df_add
        return
    # otherwise append the row to the existing dataframe
    df = pd.concat([df, df_add], ignore_index=True, sort=False)

We now define a simple function that returns the best configuration from a Kernel Tuner run:

In [None]:
def get_optimal_config(objective: str, tune_parameters: dict, higher_is_better=True) -> dict:
    """Return the best configuration of a simulated brute-force Xgemm tuning run.

    Args:
        objective: name of the time/metric value to optimize.
        tune_parameters: tunable parameters and their candidate values.
        higher_is_better: whether larger objective values are preferred.

    Returns:
        The ``best_config`` entry of the environment returned by the tuner.
    """
    tuner_options = dict(
        # pass a copy so the module-level list is never handed to the tuner itself
        block_size_names=deepcopy(block_size_names),
        simulation_mode=True,
        restrictions=restrict,
        grid_div_x=grid_div_x,
        grid_div_y=grid_div_y,
        strategy="brute_force",
        metrics=metrics,
        objective=objective,
        objective_higher_is_better=higher_is_better,
        cache="GEMM_A100_cache.json",
        quiet=True,
    )
    # kernel source and argument list are left empty; the run uses the cache file
    _results, environment = kt.tune_kernel(
        "Xgemm", "", problem_size, [], tune_parameters, **tuner_options
    )
    return environment['best_config']

### Tuning for Time

We start by simply optimizing for the lowest possible time, and printing the resulting time and the energy it takes in joules. 

In [None]:
# Tune over the full parameter set for the lowest time (lower is better)
config_race_to_idle_all_clocks = get_optimal_config("time", tune_params, higher_is_better=False)
# Record the result under a readable name for the tables and plots below
add_to_dataframe("race-to-idle (global)", config_race_to_idle_all_clocks)
# Last expression of the cell: show name, time and energy of the stored rows
df[['name', 'time', 'nvml_energy']]

### Tuning for Time and Energy

The next step is to use our previous time-optimal configuration, and re-tune only the clock frequencies for energy efficiency. 

In [None]:
def tune_second_phase(baseconfig: dict, tune_key: str, objective: str, higher_is_better=True):
    """Re-tune only ``tune_key``, pinning all other tunable parameters to ``baseconfig``.

    Every parameter of ``tune_params`` that also appears in ``baseconfig`` is
    reduced to that single value, except ``tune_key``, which keeps its full
    list of candidates. Returns the best configuration for ``objective``.
    """
    pinned_params = {
        param: (
            deepcopy(candidates)
            if param == tune_key or param not in baseconfig
            else [baseconfig[param]]
        )
        for param, candidates in tune_params.items()
    }
    return get_optimal_config(objective, pinned_params, higher_is_better)

# Keep the time-optimal configuration and re-tune only the clock for GFLOPS/W
config_race_to_idle_plus_clocks = tune_second_phase(config_race_to_idle_all_clocks, tune_key='nvml_gr_clock', objective='GFLOPS/W')
# Record the result under a readable name for the tables and plots below
add_to_dataframe("race-to-idle + clocks", config_race_to_idle_plus_clocks)

### Tuning for Energy

The final step is to tune for energy efficiency globally. 

In [None]:
# Tune over the full parameter set for the highest GFLOPS/W
config_energy_to_solution_global = get_optimal_config("GFLOPS/W", tune_params)
# Record the result under a readable name for the tables and plots below
add_to_dataframe("energy-to-solution (global)", config_energy_to_solution_global)

### Plotting the Results

We can now look at the results in terms of energy consumption per configuration in the bar chart below:

In [None]:
# One horizontal bar per stored configuration, coloured by name; the legend
# is switched off because the y-axis already carries the names
energy_ax = sns.barplot(y=df.name, x=df.nvml_energy, orient='h', hue=df.name, legend=False)
energy_ax.set_xlabel('Energy (J), lower is better')
energy_ax.set_ylabel('')
energy_ax.set_title('Lowest energy configuration')
plt.tight_layout()

Finally, we can also make a scatterplot to show the relation between energy and time of our three configurations. 
The performance-optimized configuration gives the best performance, but at the cost of almost double the energy of the energy-optimized configuration.
The second configuration provides a middle ground: it is slower than global race-to-idle, but more energy efficient.

In [None]:
# One marker per stored configuration: energy on x, time on y, coloured by name
tradeoff_ax = sns.scatterplot(y=df.time, x=df.nvml_energy, hue=df.name, s=250)
tradeoff_ax.set_xlabel('Energy (J), lower is better')
tradeoff_ax.set_ylabel('Time, lower is better')
tradeoff_ax.set_title('Energy versus time')
# Redraw the legend without a title
tradeoff_ax.legend(title='')
plt.tight_layout()