# AutoTVM: Tune with Templates

官网 doc: https://tvm.apache.org/docs/how_to/tune_with_autotvm/index.html

要点:
1. 使用 autotvm 模块优化 AlexNet 模型。与 [TVM Ansor: Template-Free Auto-Tuning](https://github.com/JackonYang/hands-on-tvm/blob/main/tutorial-and-how-to/02-tvm-ansor-template-free-tune.ipynb) 用的模型相同.
2. autotvm 的 tune 分 2 步：(1) tune kernel; (2) tune graph.
3. 总计 tune 约 13min，模型加速 ~5 倍。硬件: Intel(R) Xeon(R) Gold 5320 CPU。

备注:

1. 虽然，原理上需要写 template，但 CPU/GPU 上，使用默认 template，无脑暴力搜，结果也不错。
2. 搜 kernel 的 tuner_obj，& 搜 graph 的 Tuner，都有多个可选工具，差别暂不清楚。感觉，算力够的话，可以逐个 apply 选最终的 best。
3. 官网文档：this tutorial will not run on Windows or recent versions of macOS。实测，Mac M1 确实跑不了。原因未查。

# 1. 准备模型 & 测试数据

1. 使用 Pytorch 的 AlexNet。最简单的 CNN 模型。
2. 更大的模型也可以，但 ansor 的搜索时间更久，也不方便对比优化前后的差异，不适合学习使用。


In [1]:
import tvm
from tvm import relay
from tvm.contrib import graph_executor
# from tvm.ir.module import IRModule

import numpy as np

import torch
import torchvision
from torchvision import transforms

# required by autotvm
from tvm import autotvm
# kernel tunners of autotvm
from tvm.autotvm.tuner import XGBTuner, GATuner, RandomTuner, GridSearchTuner
# graph tuners of autotvm
from tvm.autotvm.graph_tuner import DPTuner, PBQPTuner

from PIL import Image

print('tvm versin: %s' % tvm.__version__)

tvm versin: 0.13.dev0


In [2]:
model = torchvision.models.alexnet(weights='IMAGENET1K_V1')
# model = torchvision.models.resnet18(weights='IMAGENET1K_V1')
model = model.eval()
model

AlexNet(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(11, 11), stride=(4, 4), padding=(2, 2))
    (1): ReLU(inplace=True)
    (2): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(64, 192, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
    (4): ReLU(inplace=True)
    (5): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (6): Conv2d(192, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (7): ReLU(inplace=True)
    (8): Conv2d(384, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (9): ReLU(inplace=True)
    (10): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (avgpool): AdaptiveAvgPool2d(output_size=(6, 6))
  (classifier): Sequential(
    (0): Dropout(p=0.5, inplace=False)
    (1): Linear(in_features=9216, out_features=4096, bias=True)
 

In [3]:
img_path = 'test-data/cat.png'
img = Image.open(img_path).resize((224, 224))

my_preprocess = transforms.Compose(
    [
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ]
)
img = my_preprocess(img)
# 新增Batch维度
img = np.expand_dims(img, 0)

In [4]:
# export to TorchScripted model, so that we can use it in TVM
img_input_name = 'input0'
img_shape = [1, 3, 224, 224]
input_shapes = [(img_input_name, img_shape)]

input_data = torch.randn(img_shape)
scripted_model = torch.jit.trace(model, input_data).eval()

In [5]:
with torch.no_grad():
    torch_img = torch.from_numpy(img)
    output = model(torch_img)

    # Get top-1 result for PyTorch
    top1_torch = np.argmax(output.numpy())
    print('Torch top-1 id: %d' % top1_torch)

Torch top-1 id: 285


# 2. baseline 性能 - 只编译，不优化

In [6]:
mod, params = relay.frontend.from_pytorch(scripted_model, input_shapes)

target = 'llvm'
# compile the model
with tvm.transform.PassContext(opt_level=2):
    lib = relay.build(mod, target=target, params=params)

One or more operators have not been tuned. Please tune your model for better performance. Use DEBUG logging level to see more details.


In [7]:
# run the model
def run_model(lib):
    m = graph_executor.GraphModule(lib["default"](tvm.cpu(0)))
    m.set_input(img_input_name, tvm.nd.array(img.astype('float32')))
    m.run()
    tvm_output = m.get_output(0)

    top1_tvm_1 = np.argmax(tvm_output.numpy()[0])
    print("TVM top-1 id: %s" % top1_tvm_1)

    # Evaluate
    print("Evaluate inference time cost...")
    print(m.benchmark(tvm.cpu(0), repeat=3, min_repeat_ms=500))

In [8]:
run_model(lib)

TVM top-1 id: 285
Evaluate inference time cost...
Execution time summary:
 mean (ms)   median (ms)    max (ms)     min (ms)     std (ms)  
  42.3022      42.2435      42.4208      42.2422       0.0839                  


# 3. 使用 AutoTVM 优化模型

分 4 步：

1. 构造 tasks. API 是 `autotvm.task.extract_from_program`。
2. tune kernel (operator)
3. tune graph
4. apply best schedule

In [9]:
target = "llvm"
tasks = autotvm.task.extract_from_program(
    mod["main"], target=target, params=params, ops=(relay.op.get("nn.conv2d"),)
)

In [10]:
# example of generated tasks
print(len(tasks), type(tasks[0]))
print(tasks[0].config_space)

5 <class 'tvm.autotvm.task.task.Task'>
ConfigSpace (len=252, range_length=252, space_map=
   0 tile_ic: Split(policy=factors, product=3, num_outputs=2) len=2
   1 tile_oc: Split(policy=factors, product=64, num_outputs=2) len=7
   2 tile_ow: Split(policy=verbose, product=55, num_outputs=2) len=9
   3 unroll_kw: OtherOption([True, False]) len=2
)


In [11]:
# tune kernels
log_file = "tune-autotvm-alexnet.json"

measure_option = autotvm.measure_option(
    builder=autotvm.LocalBuilder(),
    runner=autotvm.LocalRunner(
        number=1, repeat=1, min_repeat_ms=0, enable_cpu_cache_flush=True
    ),
)

for i, task in enumerate(tasks):
    prefix = "[Task %2d/%2d] " % (i + 1, len(tasks))
    tuner_obj = XGBTuner(task, loss_type="rank")

    n_trial = min(30, len(task.config_space))
    tuner_obj.tune(
        n_trial=n_trial,
        early_stopping=None,
        measure_option=measure_option,
        callbacks=[
            autotvm.callback.progress_bar(n_trial, prefix=prefix),
            autotvm.callback.log_to_file(log_file),
        ],
    )

[Task  1/ 5]  Current/Best:  171.69/ 545.96 GFLOPS | Progress: (30/30) | 39.33 s Done.
[Task  2/ 5]  Current/Best:  102.52/ 589.67 GFLOPS | Progress: (30/30) | 29.11 s Done.
[Task  3/ 5]  Current/Best:  461.77/ 530.50 GFLOPS | Progress: (30/30) | 42.68 s Done.
[Task  4/ 5]  Current/Best:   90.55/ 365.76 GFLOPS | Progress: (30/30) | 23.37 s Done.
[Task  5/ 5]  Current/Best:  514.72/ 514.72 GFLOPS | Progress: (30/30) | 17.84 s Done.


In [12]:
# cost 8min if use_DP = True
# cost 10min if use_DP = False
use_DP = False
input_shape_dict = {img_input_name: img_shape}

graph_opt_sch_file = "tune-autotvm-alexnet-graph_opt.log"

target_op = [
    relay.op.get("nn.conv2d"),
]
Tuner = DPTuner if use_DP else PBQPTuner
tuner = Tuner(mod['main'], input_shape_dict, log_file, target_op, target)
tuner.benchmark_layout_transform(min_exec_num=2)
tuner.run()
tuner.write_opt_sch2record_file(graph_opt_sch_file)

2023-06-15 17:04:28,819 INFO Start to benchmark layout transformation...
2023-06-15 17:13:51,695 INFO Benchmarking layout transformation successful.
2023-06-15 17:14:05,594 INFO Start to run PBQP algorithm...
2023-06-15 17:14:05,597 INFO Finished PBQPExecutor run. Got optimal solution.
2023-06-15 17:14:05,598 INFO Writing optimal schedules to tune-autotvm-alexnet-graph_opt.log successfully.


In [13]:
# compile with history best schedule found by autotvm
with autotvm.apply_history_best(log_file):
    with tvm.transform.PassContext(opt_level=3):
        lib2 = relay.build(mod, target=target, params=params)

In [14]:
# run the tuned best model
run_model(lib2)

TVM top-1 id: 285
Evaluate inference time cost...
Execution time summary:
 mean (ms)   median (ms)    max (ms)     min (ms)     std (ms)  
   8.7378       8.7448       8.7527       8.7158       0.0159                  


In [15]:
# to compare the performance, re-run model before auto-scheduler
run_model(lib)

TVM top-1 id: 285
Evaluate inference time cost...
Execution time summary:
 mean (ms)   median (ms)    max (ms)     min (ms)     std (ms)  
  42.3053      42.3333      42.3391      42.2435       0.0438                  
