# Load a model from onnx

In [None]:
import onnx

# Path of the ONNX model file; being relative, it is resolved against the
# current working directory.
model_path = "resnet50-v2-7.onnx"

# Read the file into an in-memory ONNX model object.
onnx_model = onnx.load(model_path)

# Prepare Data

In [None]:
from tvm.contrib.download import download_testdata
import numpy as np

# `Image.open` is used below; without this import the cell fails with NameError.
from PIL import Image

# Download the test image; the second argument is the file name it is stored under.
img_url = "https://s3.amazonaws.com/model-server/inputs/kitten.jpg"
img_path = download_testdata(img_url, "imagenet_cat.png", module="data")

# Resize it to 224x224
resized_image = Image.open(img_path).resize((224, 224))
img_data = np.asarray(resized_image).astype("float32")

# Our input image is in HWC layout while ONNX expects CHW input, so convert the array
img_data = np.transpose(img_data, (2, 0, 1))

# Normalize according to the ImageNet input specification.
# The statistics are reshaped to (3, 1, 1) so they broadcast per channel over
# the CHW array.
imagenet_mean = np.array([0.485, 0.456, 0.406]).reshape((3, 1, 1))
imagenet_stddev = np.array([0.229, 0.224, 0.225]).reshape((3, 1, 1))
norm_img_data = (img_data / 255 - imagenet_mean) / imagenet_stddev

# Add the batch dimension, as we are expecting 4-dimensional input: NCHW.
img_data = np.expand_dims(norm_img_data, axis=0)

# Compile the ONNX to Relay model

The `relay.frontend.from_onnx` function is called with the ONNX model and the shape dictionary as arguments. It converts the ONNX model to a Relay module (mod) and extracts the parameters (params) from the ONNX model. 

The mod object represents the computation graph and can be further optimized and executed using TVM.

In [None]:
from tvm import relay

# Name of the model input and the shape of the array that will be bound to it.
input_name = "data"
shape_dict = {input_name: img_data.shape}

# Target the model will be compiled for.
target = "llvm"

# Import the ONNX model: `mod` is the resulting Relay module and `params`
# holds the parameters extracted from the ONNX model.
mod, params = relay.frontend.from_onnx(onnx_model, shape=shape_dict)


# Compile the Relay module (mod) into a TVM module

With the `tvm.transform.PassContext(opt_level=3)` block, we are creating a `PassContext` object to specify the optimization level for the subsequent operations. The `opt_level=3` indicates a high level of optimization.

Then, we use the `relay.build` function to compile the Relay module (mod) into a TVM module, i.e. a compiled library that can be loaded and executed by the graph executor.

By executing this code snippet, the Relay module will be optimized using the specified optimization level, compiled into a TVM module, and assigned to the lib variable for further usage or deployment.

In [None]:
import tvm

# Build the Relay module for `target` while a pass context with
# optimization level 3 is active.
opt_ctx = tvm.transform.PassContext(opt_level=3)
with opt_ctx:
    lib = relay.build(mod, target, params=params)


`tvm.device(str(target), 0)` creates a device object using the TVM library's tvm.device() function. The second argument "0" is the device index, which is used when there are multiple devices of the same type (e.g., multiple GPUs).

`graph_executor.GraphModule(lib["default"](dev))` creates a GraphModule object, which is a TVM construct used to execute compiled models. The lib object is the compiled TVM module returned by `relay.build` above, and the "default" key is used to access its default module. The dev object, which represents the target device, is passed to the lib["default"] function to ensure that the module is executed on the correct device.

In [None]:
from tvm.contrib import graph_executor

# Device with index 0 of the kind named by `target`.
dev = tvm.device(str(target), 0)

# Look up the "default" entry of the built library, call it with the device,
# and wrap the result in a GraphModule for execution.
create_default = lib["default"]
module = graph_executor.GraphModule(create_default(dev))

# Run the module

`module.get_output(0, tvm.nd.empty(output_shape))` retrieves an output of the model. The first parameter is the index of the output; the second is an empty TVM NDArray object with shape `output_shape`, created here to store the output result.

In [None]:
dtype = "float32"
output_shape = (1, 1000)

# Bind the preprocessed image to the model input and execute the graph.
module.set_input(input_name, img_data)
module.run()

# Fetch output 0 into a freshly allocated buffer and convert it to NumPy.
output_buffer = tvm.nd.empty(output_shape)
tvm_output = module.get_output(0, output_buffer).numpy()

# Tune the model

## create a TVM runner

The runner takes compiled code that is generated with a specific set of parameters and measures the performance of it. 
- Parameter `number` specifies the number of times the model inference should be executed for each measurement.
- Parameter `repeat` specifies the number of times the entire measurement process (with number runs) should be repeated.
- Parameter `timeout` sets the timeout (in seconds) for each measurement run.
- Parameter `min_repeat_ms` sets the minimum duration (in milliseconds) of one repeat. If the `number` runs of a repeat finish in less than this time, `number` is automatically increased until the minimum is met.
- Parameter `enable_cpu_cache_flush` is specific to CPU-based models. When set to True, it ensures that the CPU cache is flushed before each measurement run, which can help provide more consistent and accurate performance measurements.

In [None]:
from tvm import autotvm

# Measurement settings for the runner.
number = 10  # passed as `number`
repeat = 1  # passed as `repeat`
timeout = 10  # in seconds
min_repeat_ms = 0  # since we're tuning on a CPU, can be set to 0

# Collect the keyword arguments first, then construct the local runner.
runner_kwargs = {
    "number": number,
    "repeat": repeat,
    "timeout": timeout,
    "min_repeat_ms": min_repeat_ms,
    "enable_cpu_cache_flush": True,
}
runner = autotvm.LocalRunner(**runner_kwargs)

## Create a tuning option

We use an XGBoost algorithm for guiding the search. For a production job, you will want to set the number of trials to be larger than the value of 20 used here. For CPU we recommend 1500, for GPU 3000-4000. 

The `early_stopping` parameter is the minimum number of trials to run before a condition that stops the search early can be applied. 

The measure option indicates where trial code will be built, and where it will be run. In this case, we’re using the `LocalRunner` we just created and a `LocalBuilder`. The `tuning_records` option specifies a file to write the tuning data to.

In [None]:
# Settings consumed by the tuning loop: tuner name, trial budget,
# early-stopping threshold, how candidates are built and measured, and the
# file the tuning records are written to.
tuning_option = dict(
    tuner="xgb",
    trials=20,
    early_stopping=100,
    measure_option=autotvm.measure_option(
        runner=runner,
        builder=autotvm.LocalBuilder(build_func="default"),
    ),
    tuning_records="resnet-50-v2-autotuning.json",
)

## Execute tuning

`mod` and `params` are the outputs from `mod, params = relay.frontend.from_onnx(onnx_model, shape_dict)`.
We use `XGBTuner` as the tuner for each tuning task extracted from `mod`.

In [None]:
# GATuner, GridSearchTuner and RandomTuner are referenced below, so they must
# be imported alongside XGBTuner.
from tvm.autotvm.tuner import GATuner, GridSearchTuner, RandomTuner, XGBTuner

# Keyword arguments passed to XGBTuner for every supported "xgb*" tuner name.
_XGB_TUNER_KWARGS = {
    "xgb": {"loss_type": "reg"},
    "xgb_knob": {"loss_type": "reg", "feature_type": "knob"},
    "xgb_itervar": {"loss_type": "reg", "feature_type": "itervar"},
    "xgb_curve": {"loss_type": "reg", "feature_type": "curve"},
    "xgb_rank": {"loss_type": "rank"},
    "xgb_rank_knob": {"loss_type": "rank", "feature_type": "knob"},
    "xgb_rank_itervar": {"loss_type": "rank", "feature_type": "itervar"},
    "xgb_rank_curve": {"loss_type": "rank", "feature_type": "curve"},
    "xgb_rank_binary": {"loss_type": "rank-binary"},
    "xgb_rank_binary_knob": {"loss_type": "rank-binary", "feature_type": "knob"},
    "xgb_rank_binary_itervar": {"loss_type": "rank-binary", "feature_type": "itervar"},
    "xgb_rank_binary_curve": {"loss_type": "rank-binary", "feature_type": "curve"},
}

# Factories for the tuners that are not built on XGBTuner.
_OTHER_TUNERS = {
    "ga": lambda task: GATuner(task, pop_size=50),
    "random": RandomTuner,
    "gridsearch": GridSearchTuner,
}


def _create_tuner(name, task):
    """Return a tuner object for `task` selected by the tuner name `name`.

    Raises:
        ValueError: if `name` is not one of the supported tuner names.
    """
    if name in _XGB_TUNER_KWARGS:
        return XGBTuner(task, **_XGB_TUNER_KWARGS[name])
    if name in _OTHER_TUNERS:
        return _OTHER_TUNERS[name](task)
    raise ValueError("Invalid tuner: " + name)


tasks = autotvm.task.extract_from_program(mod["main"], target=target, params=params)

# choose tuner: honour the configured name instead of hard-coding it per task
tuner = tuning_option["tuner"]

# Tune the extracted tasks sequentially.
for i, task in enumerate(tasks):
    prefix = f"[Task {i + 1:2d}/{len(tasks):2d}] "

    # create tuner
    tuner_obj = _create_tuner(tuner, task)

    # Never request more trials than the task's config space contains, and
    # size the progress bar with the same number so the two agree.
    n_trial = min(tuning_option["trials"], len(task.config_space))

    tuner_obj.tune(
        n_trial=n_trial,
        early_stopping=tuning_option["early_stopping"],
        measure_option=tuning_option["measure_option"],
        callbacks=[
            autotvm.callback.progress_bar(n_trial, prefix=prefix),
            autotvm.callback.log_to_file(tuning_option["tuning_records"]),
        ],
    )

# Compiling an Optimized Model with Tuning Data

`autotvm.apply_history_best(tuning_option["tuning_records"])` applies the results recorded during tuning while the model is compiled.

In [None]:
# Rebuild the model with the tuning records applied and optimization level 3.
tuning_log = tuning_option["tuning_records"]
with autotvm.apply_history_best(tuning_log), tvm.transform.PassContext(opt_level=3, config={}):
    lib = relay.build(mod, target, params=params)

# Recreate the device handle and the executable graph module from the new build.
dev = tvm.device(str(target), 0)
create_default = lib["default"]
module = graph_executor.GraphModule(create_default(dev))