# TVM Ansor: Template-Free Auto-Tuning for Deep Learning Models

官网 doc: https://tvm.apache.org/docs/how_to/tune_with_autoscheduler/tune_network_x86.html

要点：
1. 使用 ansor (tvm.auto_scheduler) 自动优化 AlexNet 模型。
2. 搜索 5min，模型加速 2x。硬件: Intel(R) Xeon(R) Gold 5320 CPU。

备注：

1. 模型优化相关的代码，一共不到 10 行代码。上手非常简单。
2. ansor 是 template free 的优化器，不要求理解模型和硬件。新人友好。
3. 官网文档：this tutorial will not run on Windows or recent versions of macOS。实测，Mac M1 确实跑不了。原因未查。

# 1. 准备模型 & 测试数据

1. 使用 Pytorch 的 AlexNet。最简单的 CNN 模型。
2. 更大的模型也可以，但 ansor 的搜索时间更久，也不方便对比优化前后的差异，不适合学习使用。


In [1]:
import tvm
from tvm import relay
from tvm.contrib import graph_executor
# from tvm.ir.module import IRModule

import numpy as np

import torch
import torchvision
from torchvision import transforms

from tvm import auto_scheduler

from PIL import Image

print('tvm versin: %s' % tvm.__version__)

tvm versin: 0.13.dev0


In [2]:
model = torchvision.models.alexnet(weights='IMAGENET1K_V1')
# model = torchvision.models.resnet18(weights='IMAGENET1K_V1')
model = model.eval()
model

AlexNet(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(11, 11), stride=(4, 4), padding=(2, 2))
    (1): ReLU(inplace=True)
    (2): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(64, 192, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
    (4): ReLU(inplace=True)
    (5): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (6): Conv2d(192, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (7): ReLU(inplace=True)
    (8): Conv2d(384, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (9): ReLU(inplace=True)
    (10): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (avgpool): AdaptiveAvgPool2d(output_size=(6, 6))
  (classifier): Sequential(
    (0): Dropout(p=0.5, inplace=False)
    (1): Linear(in_features=9216, out_features=4096, bias=True)
 

In [3]:
img_path = 'test-data/cat.png'
img = Image.open(img_path).resize((224, 224))

my_preprocess = transforms.Compose(
    [
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ]
)
img = my_preprocess(img)
# 新增Batch维度
img = np.expand_dims(img, 0)

In [4]:
# export to TorchScripted model, so that we can use it in TVM
img_input_name = 'input0'
img_shape = [1, 3, 224, 224]
input_shapes = [(img_input_name, img_shape)]

input_data = torch.randn(img_shape)
scripted_model = torch.jit.trace(model, input_data).eval()

In [5]:
with torch.no_grad():
    torch_img = torch.from_numpy(img)
    output = model(torch_img)

    # Get top-1 result for PyTorch
    top1_torch = np.argmax(output.numpy())
    print('Torch top-1 id: %d' % top1_torch)

Torch top-1 id: 285


# 2. baseline 性能 - 只编译，不优化

In [6]:
mod, params = relay.frontend.from_pytorch(scripted_model, input_shapes)

target = 'llvm'
# compile the model
with tvm.transform.PassContext(opt_level=2):
    lib = relay.build(mod, target=target, params=params)

One or more operators have not been tuned. Please tune your model for better performance. Use DEBUG logging level to see more details.


In [7]:
# run the model
def run_model(lib):
    m = graph_executor.GraphModule(lib["default"](tvm.cpu(0)))
    m.set_input(img_input_name, tvm.nd.array(img.astype('float32')))
    m.run()
    tvm_output = m.get_output(0)

    top1_tvm_1 = np.argmax(tvm_output.numpy()[0])
    print("TVM top-1 id: %s" % top1_tvm_1)

    # Evaluate
    print("Evaluate inference time cost...")
    print(m.benchmark(tvm.cpu(0), repeat=3, min_repeat_ms=500))

In [8]:
run_model(lib)

TVM top-1 id: 285
Evaluate inference time cost...
Execution time summary:
 mean (ms)   median (ms)    max (ms)     min (ms)     std (ms)  
  50.7338      53.8374      56.0268      42.3371       6.0042                  


# 3. 使用 ansor 自动优化

1. ansor 一共生成了 12 个 task。
2. tune_option 的 num_measure_trials 参数，不能小于 task 数，所以，最小是 12 次。
3. num_measure_trials 参数，越大，搜索时间越长，模型性能越好。
4. 本例中，使用最小搜索次数 12 次，搜索 5min，模型加速 2x。

In [9]:
target = tvm.target.Target("llvm -mcpu=skylake-avx512")
tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target)

In [10]:
# example of generated tasks
print(tasks[0].compute_dag)

p0 = PLACEHOLDER [1, 1, 224, 224, 3]
data_pad(i0, i1, i2, i3, i4) = tir.if_then_else(((((i2 >= 2) && (i2 < 226)) && (i3 >= 2)) && (i3 < 226)), p0[i0, i1, (i2 - 2), (i3 - 2), i4], 0f)
p1 = PLACEHOLDER [4, 1, 11, 11, 3, 16]
conv2d_NCHWc(n, oc_chunk, oh, ow, oc_block) += (data_pad[n, floordiv(ic, 3), ((oh*4) + kh), ((ow*4) + kw), floormod(ic, 3)]*p1[oc_chunk, floordiv(ic, 3), kh, kw, floormod(ic, 3), oc_block])
p2 = PLACEHOLDER [1, 4, 1, 1, 16]
T_add(ax0, ax1, ax2, ax3, ax4) = (conv2d_NCHWc[ax0, ax1, ax2, ax3, ax4] + p2[ax0, ax1, 0, 0, ax4])
T_relu(ax0, ax1, ax2, ax3, ax4) = max(T_add[ax0, ax1, ax2, ax3, ax4], 0f)



In [11]:
log_file = "tune-alexnet.json"

tuner = auto_scheduler.TaskScheduler(tasks, task_weights)

tune_option = auto_scheduler.TuningOptions(
    num_measure_trials=max(len(tasks), 1),  # change this to 20000 to achieve the best performance
    runner=auto_scheduler.LocalRunner(repeat=0, enable_cpu_cache_flush=True),
    measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
)

In [12]:
# long time to tune. 5min at least
tuner.tune(tune_option)

|  ID  |                       Task Description                        | Latency (ms) | Speed (GFLOPS) | Trials |----------------------------------------------------------------------

-----------------------------------------------------------------------------------------------------------------
|    0 |              vm_mod_fused_nn_contrib_conv2d_NCHWc_add_nn_relu |            - |              - |      0 |
|    1 |                                     vm_mod_fused_nn_dense_add |            - |              - |      0 |
|    2 |            vm_mod_fused_nn_contrib_conv2d_NCHWc_add_nn_relu_1 |            - |              - |      0 |
------------------------------  [ Task Scheduler ]
----------------------------------------------------------------------
|    3 |                                  vm_mod_fused_nn_max_pool2d_2 |            - |              - |      0 |
|    4 |                           vm_mod_fused_nn_dense_add_nn_relu_1 |            - |              - |      0 |
|    5 | 

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  keepdims=keepdims, where=where)
  subok=False)
  ret = ret.dtype.type(ret / rcount)


Time elapsed for measurement: 2.41 s
----------------------------------------------------------------------
------------------------------  [ Train cost model ]
----------------------------------------------------------------------
Time elapsed for training: 4.42 s
----------------------------------------------------------------------
|  ID  |                       Task Description                        | Latency (ms) | Speed (GFLOPS) | Trials |
-----------------------------------------------------------------------------------------------------------------
------------------------------  [ Task Scheduler ]
----------------------------------------------------------------------
|    0 |              vm_mod_fused_nn_contrib_conv2d_NCHWc_add_nn_relu |            - |              - |      1 |
|    1 |                                     vm_mod_fused_nn_dense_add |            - |              - |      0 |
|    2 |            vm_mod_fused_nn_contrib_conv2d_NCHWc_add_nn_relu_1 |            -

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  keepdims=keepdims, where=where)
  subok=False)
  ret = ret.dtype.type(ret / rcount)


Time elapsed for measurement: 2.10 s
----------------------------------------------------------------------
------------------------------  [ Train cost model ]
----------------------------------------------------------------------
Time elapsed for training: 0.50 s
----------------------------------------------------------------------
|  ID  |                       Task Description                        | Latency (ms) | Speed (GFLOPS) | Trials |
-----------------------------------------------------------------------------------------------------------------
------------------------------  [ Task Scheduler ]
----------------------------------------------------------------------
|    0 |              vm_mod_fused_nn_contrib_conv2d_NCHWc_add_nn_relu |            - |              - |      1 |
|    1 |                                     vm_mod_fused_nn_dense_add |            - |              - |      1 |
|    2 |            vm_mod_fused_nn_contrib_conv2d_NCHWc_add_nn_relu_1 |            -

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  keepdims=keepdims, where=where)
  subok=False)
  ret = ret.dtype.type(ret / rcount)


Time elapsed for measurement: 2.27 s
----------------------------------------------------------------------
------------------------------  [ Train cost model ]
----------------------------------------------------------------------
Time elapsed for training: 0.87 s
----------------------------------------------------------------------
------------------------------|  ID  |                       Task Description                        | Latency (ms) | Speed (GFLOPS) | Trials |
-----------------------------------------------------------------------------------------------------------------
|    0 |              vm_mod_fused_nn_contrib_conv2d_NCHWc_add_nn_relu |            - |              - |      1 |
  [ Task Scheduler ]
----------------------------------------------------------------------
|    1 |                                     vm_mod_fused_nn_dense_add |            - |              - |      1 |
|    2 |            vm_mod_fused_nn_contrib_conv2d_NCHWc_add_nn_relu_1 |            -

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  keepdims=keepdims, where=where)
  subok=False)
  ret = ret.dtype.type(ret / rcount)


Time elapsed for measurement: 2.00 s
----------------------------------------------------------------------
------------------------------  [ Train cost model ]
----------------------------------------------------------------------
Time elapsed for training: 0.96 s
----------------------------------------------------------------------
|  ID  |                       Task Description                        | Latency (ms) | Speed (GFLOPS) | Trials |
-----------------------------------------------------------------------------------------------------------------
------------------------------  [ Task Scheduler ]
----------------------------------------------------------------------
|    0 |              vm_mod_fused_nn_contrib_conv2d_NCHWc_add_nn_relu |            - |              - |      1 |
|    1 |                                     vm_mod_fused_nn_dense_add |            - |              - |      1 |
|    2 |            vm_mod_fused_nn_contrib_conv2d_NCHWc_add_nn_relu_1 |            -

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  keepdims=keepdims, where=where)
  subok=False)
  ret = ret.dtype.type(ret / rcount)


Time elapsed for measurement: 2.36 s
----------------------------------------------------------------------
------------------------------  [ Train cost model ]
----------------------------------------------------------------------
Time elapsed for training: 0.63 s
----------------------------------------------------------------------
|  ID  |                       Task Description                        | Latency (ms) | Speed (GFLOPS) | Trials |
-----------------------------------------------------------------------------------------------------------------
|    0 |              vm_mod_fused_nn_contrib_conv2d_NCHWc_add_nn_relu |            - |              - |      1 |
|    1 |                                     vm_mod_fused_nn_dense_add |            - |              - |      1 |
------------------------------  [ Task Scheduler ]
----------------------------------------------------------------------
|    2 |            vm_mod_fused_nn_contrib_conv2d_NCHWc_add_nn_relu_1 |            -

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  keepdims=keepdims, where=where)
  subok=False)
  ret = ret.dtype.type(ret / rcount)


Time elapsed for measurement: 2.65 s
----------------------------------------------------------------------
------------------------------  [ Train cost model ]
----------------------------------------------------------------------
Time elapsed for training: 0.59 s
----------------------------------------------------------------------
------------------------------  [ Task Scheduler ]
----------------------------------------------------------------------
|  ID  |                       Task Description                        | Latency (ms) | Speed (GFLOPS) | Trials |
-----------------------------------------------------------------------------------------------------------------
|    0 |              vm_mod_fused_nn_contrib_conv2d_NCHWc_add_nn_relu |            - |              - |      1 |
|    1 |                                     vm_mod_fused_nn_dense_add |            - |              - |      1 |
|    2 |            vm_mod_fused_nn_contrib_conv2d_NCHWc_add_nn_relu_1 |            -

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  keepdims=keepdims, where=where)
  subok=False)
  ret = ret.dtype.type(ret / rcount)


Time elapsed for measurement: 1.95 s
----------------------------------------------------------------------
------------------------------  [ Train cost model ]
----------------------------------------------------------------------
Time elapsed for training: 0.95 s
----------------------------------------------------------------------
------------------------------  [ Task Scheduler ]
----------------------------------------------------------------------
|  ID  |                       Task Description                        | Latency (ms) | Speed (GFLOPS) | Trials |
-----------------------------------------------------------------------------------------------------------------
|    0 |              vm_mod_fused_nn_contrib_conv2d_NCHWc_add_nn_relu |            - |              - |      1 |
|    1 |                                     vm_mod_fused_nn_dense_add |            - |              - |      1 |
|    2 |            vm_mod_fused_nn_contrib_conv2d_NCHWc_add_nn_relu_1 |            -

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  keepdims=keepdims, where=where)
  subok=False)
  ret = ret.dtype.type(ret / rcount)


Time elapsed for measurement: 2.85 s
----------------------------------------------------------------------
------------------------------  [ Train cost model ]
----------------------------------------------------------------------
Time elapsed for training: 1.05 s
----------------------------------------------------------------------
|  ID  |                       Task Description                        | Latency (ms) | Speed (GFLOPS) | Trials |
-----------------------------------------------------------------------------------------------------------------
------------------------------  [ Task Scheduler ]
----------------------------------------------------------------------
|    0 |              vm_mod_fused_nn_contrib_conv2d_NCHWc_add_nn_relu |            - |              - |      1 |
|    1 |                                     vm_mod_fused_nn_dense_add |            - |              - |      1 |
|    2 |            vm_mod_fused_nn_contrib_conv2d_NCHWc_add_nn_relu_1 |            -

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  keepdims=keepdims, where=where)
  subok=False)
  ret = ret.dtype.type(ret / rcount)


Time elapsed for measurement: 2.00 s
----------------------------------------------------------------------
------------------------------  [ Train cost model ]
----------------------------------------------------------------------
Time elapsed for training: 3.03 s
----------------------------------------------------------------------
------------------------------  [ Task Scheduler ]
----------------------------------------------------------------------
|  ID  |                       Task Description                        | Latency (ms) | Speed (GFLOPS) | Trials |
-----------------------------------------------------------------------------------------------------------------
|    0 |              vm_mod_fused_nn_contrib_conv2d_NCHWc_add_nn_relu |            - |              - |      1 |
|    1 |                                     vm_mod_fused_nn_dense_add |            - |              - |      1 |
|    2 |            vm_mod_fused_nn_contrib_conv2d_NCHWc_add_nn_relu_1 |            -

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  keepdims=keepdims, where=where)
  subok=False)
  ret = ret.dtype.type(ret / rcount)


Time elapsed for measurement: 2.16 s
----------------------------------------------------------------------
------------------------------  [ Train cost model ]
----------------------------------------------------------------------
Time elapsed for training: 0.87 s
----------------------------------------------------------------------
|  ID  |                       Task Description                        | Latency (ms) | Speed (GFLOPS) | Trials |
-----------------------------------------------------------------------------------------------------------------
------------------------------  [ Task Scheduler ]
----------------------------------------------------------------------
|    0 |              vm_mod_fused_nn_contrib_conv2d_NCHWc_add_nn_relu |            - |              - |      1 |
|    1 |                                     vm_mod_fused_nn_dense_add |            - |              - |      1 |
|    2 |            vm_mod_fused_nn_contrib_conv2d_NCHWc_add_nn_relu_1 |            -

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  keepdims=keepdims, where=where)
  subok=False)
  ret = ret.dtype.type(ret / rcount)


Time elapsed for measurement: 2.25 s
----------------------------------------------------------------------
------------------------------  [ Train cost model ]
----------------------------------------------------------------------
Time elapsed for training: 0.54 s
----------------------------------------------------------------------
|  ID  |                       Task Description                        | Latency (ms) | Speed (GFLOPS) | Trials |
-----------------------------------------------------------------------------------------------------------------
------------------------------  [ Task Scheduler ]
----------------------------------------------------------------------
|    0 |              vm_mod_fused_nn_contrib_conv2d_NCHWc_add_nn_relu |            - |              - |      1 |
|    1 |                                     vm_mod_fused_nn_dense_add |            - |              - |      1 |
|    2 |            vm_mod_fused_nn_contrib_conv2d_NCHWc_add_nn_relu_1 |            -

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  keepdims=keepdims, where=where)
  subok=False)
  ret = ret.dtype.type(ret / rcount)


Time elapsed for measurement: 2.25 s
----------------------------------------------------------------------
------------------------------  [ Train cost model ]
----------------------------------------------------------------------
Time elapsed for training: 0.64 s


In [13]:
# compile with history best schedule found by the auto-tuner
with auto_scheduler.ApplyHistoryBest(log_file):
    with tvm.transform.PassContext(opt_level=3, config={"relay.backend.use_auto_scheduler": True}):
        lib2 = relay.build(mod, target=target, params=params)

In [14]:
# run the tuned best model
run_model(lib2)

TVM top-1 id: 285
Evaluate inference time cost...
Execution time summary:
 mean (ms)   median (ms)    max (ms)     min (ms)     std (ms)  
  24.3218      24.0514      25.0542      23.8599       0.5237                  


In [15]:
# to compare the performance, re-run model before auto-scheduler
run_model(lib)

TVM top-1 id: 285
Evaluate inference time cost...
Execution time summary:
 mean (ms)   median (ms)    max (ms)     min (ms)     std (ms)  
  42.3976      42.3999      42.4098      42.3830       0.0111                  
