In [1]:
!pip install apache-tvm==0.13.dev248



In [2]:
import tvm
from tvm import te
from tvm import autotvm
import numpy as np
from tvm.autotvm.tuner import XGBTuner, GATuner, RandomTuner, GridSearchTuner


In [3]:
# Optional: uncomment the next line to force-reinstall xgboost pinned to 1.5.0.
# %pip install --force-reinstall xgboost==1.5.0

In [4]:
# Print the installed xgboost version.
# NOTE(review): presumably checked because the "xgb" tuner option relies on
# xgboost — confirm which versions are actually supported.
import xgboost
print(xgboost.__version__)


1.5.0


In [5]:
# Element type used for the A and B placeholders in the templates below.
dtype = "float32"
# Matmul problem sizes; all three dimensions are equal in this notebook.
M = N = K = 256
# Target string handed to task creation and to tvm.build.
target = "llvm -mcpu=core-avx2"

**Run AutoTVM tuning**

In [6]:
def run_tuning(tasks, measure_option, tuner="gridsearch", early_stopping=None,
               log_filename="tuning.log", n_trial=None):
    """Tune every task in ``tasks`` with the selected AutoTVM tuner.

    Args:
        tasks: Sequence of tasks; each must expose ``config_space`` (its
            ``len`` is used as the size of the search space).
        measure_option: Forwarded unchanged to ``tuner_obj.tune``.
        tuner: One of ``"xgb"``, ``"xgb-rank"``, ``"ga"``, ``"random"`` or
            ``"gridsearch"``.
        early_stopping: Forwarded unchanged to ``tuner_obj.tune``.
        log_filename: File name passed to ``autotvm.callback.log_to_file``.
        n_trial: Requested number of trials per task. A falsy value
            (``None`` or ``0``) means "the whole config space". The value is
            capped at the size of each task's own config space.

    Raises:
        ValueError: If ``tuner`` is not one of the recognised names.
    """
    for i, task in enumerate(tasks):
        prefix = "[Task %2d/%2d] " % (i + 1, len(tasks))

        # create tuner
        if tuner == "xgb" or tuner == "xgb-rank":
            tuner_obj = XGBTuner(task, loss_type="rank")
        elif tuner == "ga":
            tuner_obj = GATuner(task, pop_size=50)
        elif tuner == "random":
            tuner_obj = RandomTuner(task)
        elif tuner == "gridsearch":
            tuner_obj = GridSearchTuner(task)
        else:
            raise ValueError("Invalid tuner: " + str(tuner))

        # do tuning
        # Compute the trial budget per task in a local variable. Overwriting
        # the `n_trial` argument here would carry the cap imposed by one
        # task's config space over to every following task.
        space_size = len(task.config_space)
        task_trials = min(n_trial or space_size, space_size)
        tuner_obj.tune(
            n_trial=task_trials,
            early_stopping=early_stopping,
            measure_option=measure_option,
            callbacks=[
                autotvm.callback.progress_bar(task_trials, prefix=prefix),
                autotvm.callback.log_to_file(log_filename),
            ],
        )

In [7]:
def get_tuning_option(log_file="tune.log"):
    """Build the keyword arguments for one tuning run.

    Args:
        log_file: Value stored under the ``"log_filename"`` key.

    Returns:
        A dict with the keys ``log_filename``, ``n_trial``, ``tuner``,
        ``early_stopping`` and ``measure_option``.
    """
    local_builder = autotvm.LocalBuilder()
    local_runner = autotvm.LocalRunner(
        number=1,
        repeat=10,
        min_repeat_ms=0,
        enable_cpu_cache_flush=True,
    )
    measure = autotvm.measure_option(builder=local_builder, runner=local_runner)

    options = {}
    options["log_filename"] = log_file
    options["n_trial"] = 300
    options["tuner"] = "xgb"
    options["early_stopping"] = None
    options["measure_option"] = measure
    return options

In [8]:
@autotvm.template("matmul_template_wo_filter")
def matmul_template_wo_filter(N, M, K):
    """Tiled matmul template with an unfiltered search space.

    Declares ``C[m, n] = sum_k A[m, k] * B[k, n]`` with ``A`` of shape
    ``(M, K)`` and ``B`` of shape ``(K, N)``, splits both output axes in two
    through the ``tile_y`` / ``tile_x`` knobs and reorders the loops to
    ``(yo, xo, k, yi, xi)``.

    NOTE(review): the parameters are declared as ``(N, M, K)`` while this
    notebook passes ``(M, N, K)`` positionally; that is harmless only while
    all three sizes are equal — confirm the intended order.

    Returns:
        The schedule and the tensor list ``[A, B, C]``.
    """
    # Compute definition.
    red = te.reduce_axis((0, K), "k")
    A = te.placeholder((M, K), name="A", dtype=dtype)
    B = te.placeholder((K, N), name="B", dtype=dtype)
    C = te.compute(
        (M, N),
        lambda m, n: te.sum(A[m, red] * B[red, n], axis=red),
        name="C",
    )

    # Schedule handles.
    sched = te.create_schedule(C.op)
    row_axis, col_axis = sched[C].op.axis
    red_axis = sched[C].op.reduce_axis[0]

    # Search space: one two-way split knob per output axis.
    cfg = autotvm.get_config()
    cfg.define_split("tile_y", row_axis, num_outputs=2)
    cfg.define_split("tile_x", col_axis, num_outputs=2)

    # Apply the selected configuration.
    row_outer, row_inner = cfg["tile_y"].apply(sched, C, row_axis)
    col_outer, col_inner = cfg["tile_x"].apply(sched, C, col_axis)
    sched[C].reorder(row_outer, col_outer, red_axis, row_inner, col_inner)
    return sched, [A, B, C]

# Create the tuning task for the unfiltered template and show its space.
task1 = autotvm.task.create(
    "matmul_template_wo_filter",
    args=(M, N, K),
    target=target,
)
print(task1.config_space)
# Walk the whole index range and print only the indices reported as valid.
for idx in range(task1.config_space.range_length):
    if not task1.config_space.is_index_valid(idx):
        continue
    print(task1.config_space.get(idx))


ConfigSpace (len=81, range_length=81, space_map=
   0 tile_y: Split(policy=factors, product=256, num_outputs=2) len=9
   1 tile_x: Split(policy=factors, product=256, num_outputs=2) len=9
)
[('tile_y', [-1, 1]), ('tile_x', [-1, 1])],None,0
[('tile_y', [-1, 2]), ('tile_x', [-1, 1])],None,1
[('tile_y', [-1, 4]), ('tile_x', [-1, 1])],None,2
[('tile_y', [-1, 8]), ('tile_x', [-1, 1])],None,3
[('tile_y', [-1, 16]), ('tile_x', [-1, 1])],None,4
[('tile_y', [-1, 32]), ('tile_x', [-1, 1])],None,5
[('tile_y', [-1, 64]), ('tile_x', [-1, 1])],None,6
[('tile_y', [-1, 128]), ('tile_x', [-1, 1])],None,7
[('tile_y', [-1, 256]), ('tile_x', [-1, 1])],None,8
[('tile_y', [-1, 1]), ('tile_x', [-1, 2])],None,9
[('tile_y', [-1, 2]), ('tile_x', [-1, 2])],None,10
[('tile_y', [-1, 4]), ('tile_x', [-1, 2])],None,11
[('tile_y', [-1, 8]), ('tile_x', [-1, 2])],None,12
[('tile_y', [-1, 16]), ('tile_x', [-1, 2])],None,13
[('tile_y', [-1, 32]), ('tile_x', [-1, 2])],None,14
[('tile_y', [-1, 64]), ('tile_x', [-1, 2])],Non

In [9]:
@autotvm.template("matmul_template_with_filter")
def matmul_template_with_filter(N, M, K):
    """Tiled matmul template whose ``tile_x`` knob is filtered.

    Declares ``C[m, n] = sum_k A[m, k] * B[k, n]`` with ``A`` of shape
    ``(M, K)`` and ``B`` of shape ``(K, N)``, splits both output axes in two
    through the ``tile_y`` / ``tile_x`` knobs and reorders the loops to
    ``(yo, xo, k, yi, xi)``. The ``tile_x`` knob only keeps candidates whose
    second split size is at least 32.

    NOTE(review): the parameters are declared as ``(N, M, K)`` while this
    notebook passes ``(M, N, K)`` positionally; that is harmless only while
    all three sizes are equal — confirm the intended order.

    Returns:
        The schedule and the tensor list ``[A, B, C]``.
    """
    # Compute definition.
    red = te.reduce_axis((0, K), "k")
    A = te.placeholder((M, K), name="A", dtype=dtype)
    B = te.placeholder((K, N), name="B", dtype=dtype)
    C = te.compute(
        (M, N),
        lambda m, n: te.sum(A[m, red] * B[red, n], axis=red),
        name="C",
    )

    # Schedule handles.
    sched = te.create_schedule(C.op)
    row_axis, col_axis = sched[C].op.axis
    red_axis = sched[C].op.reduce_axis[0]

    # Search space: tile_y is unrestricted, tile_x is restricted per knob.
    cfg = autotvm.get_config()
    cfg.define_split("tile_y", row_axis, num_outputs=2)
    cfg.define_split(
        "tile_x",
        col_axis,
        num_outputs=2,
        filter=lambda split: split.size[1] >= 32,
    )

    # Apply the selected configuration.
    row_outer, row_inner = cfg["tile_y"].apply(sched, C, row_axis)
    col_outer, col_inner = cfg["tile_x"].apply(sched, C, col_axis)
    sched[C].reorder(row_outer, col_outer, red_axis, row_inner, col_inner)
    return sched, [A, B, C]


# Create the tuning task for the per-knob filtered template and show its space.
task2 = autotvm.task.create(
    "matmul_template_with_filter",
    args=(M, N, K),
    target=target,
)
print(task2.config_space)
# Walk the whole index range and print only the indices reported as valid.
for idx in range(task2.config_space.range_length):
    if not task2.config_space.is_index_valid(idx):
        continue
    print(task2.config_space.get(idx))


ConfigSpace (len=36, range_length=36, space_map=
   0 tile_y: Split(policy=factors, product=256, num_outputs=2) len=9
   1 tile_x: Split(policy=factors, product=256, num_outputs=2) len=4
)
[('tile_y', [-1, 1]), ('tile_x', [-1, 32])],None,0
[('tile_y', [-1, 2]), ('tile_x', [-1, 32])],None,1
[('tile_y', [-1, 4]), ('tile_x', [-1, 32])],None,2
[('tile_y', [-1, 8]), ('tile_x', [-1, 32])],None,3
[('tile_y', [-1, 16]), ('tile_x', [-1, 32])],None,4
[('tile_y', [-1, 32]), ('tile_x', [-1, 32])],None,5
[('tile_y', [-1, 64]), ('tile_x', [-1, 32])],None,6
[('tile_y', [-1, 128]), ('tile_x', [-1, 32])],None,7
[('tile_y', [-1, 256]), ('tile_x', [-1, 32])],None,8
[('tile_y', [-1, 1]), ('tile_x', [-1, 64])],None,9
[('tile_y', [-1, 2]), ('tile_x', [-1, 64])],None,10
[('tile_y', [-1, 4]), ('tile_x', [-1, 64])],None,11
[('tile_y', [-1, 8]), ('tile_x', [-1, 64])],None,12
[('tile_y', [-1, 16]), ('tile_x', [-1, 64])],None,13
[('tile_y', [-1, 32]), ('tile_x', [-1, 64])],None,14
[('tile_y', [-1, 64]), ('tile_x'

In [10]:
@autotvm.template("matmul_template_with_multi_filter")
def matmul_template_with_multi_filter(N, M, K):
    """Tiled matmul template with a per-knob filter and a cross-knob filter.

    Declares ``C[m, n] = sum_k A[m, k] * B[k, n]`` with ``A`` of shape
    ``(M, K)`` and ``B`` of shape ``(K, N)``, splits both output axes in two
    through the ``tile_y`` / ``tile_x`` knobs and reorders the loops to
    ``(yo, xo, k, yi, xi)``. The ``tile_x`` knob only keeps candidates whose
    second split size is at least 32, and ``multi_filter`` additionally
    requires the second ``tile_x`` size to be strictly greater than the
    second ``tile_y`` size.

    NOTE(review): the parameters are declared as ``(N, M, K)`` while this
    notebook passes ``(M, N, K)`` positionally; that is harmless only while
    all three sizes are equal — confirm the intended order.

    Returns:
        The schedule and the tensor list ``[A, B, C]``.
    """
    # Compute definition.
    red = te.reduce_axis((0, K), "k")
    A = te.placeholder((M, K), name="A", dtype=dtype)
    B = te.placeholder((K, N), name="B", dtype=dtype)
    C = te.compute(
        (M, N),
        lambda m, n: te.sum(A[m, red] * B[red, n], axis=red),
        name="C",
    )

    # Schedule handles.
    sched = te.create_schedule(C.op)
    row_axis, col_axis = sched[C].op.axis
    red_axis = sched[C].op.reduce_axis[0]

    # Search space: per-knob restriction on tile_x, then a joint restriction.
    cfg = autotvm.get_config()
    cfg.define_split("tile_y", row_axis, num_outputs=2)
    cfg.define_split(
        "tile_x",
        col_axis,
        num_outputs=2,
        filter=lambda split: split.size[1] >= 32,
    )
    cfg.multi_filter(
        filter=lambda entity: entity["tile_x"].size[1] > entity["tile_y"].size[1]
    )

    # Apply the selected configuration.
    row_outer, row_inner = cfg["tile_y"].apply(sched, C, row_axis)
    col_outer, col_inner = cfg["tile_x"].apply(sched, C, col_axis)
    sched[C].reorder(row_outer, col_outer, red_axis, row_inner, col_inner)
    return sched, [A, B, C]

# Create the tuning task for the multi-filter template and show its space.
task3 = autotvm.task.create(
    "matmul_template_with_multi_filter",
    args=(M, N, K),
    target=target,
)
print(task3.config_space)
# Walk the whole index range and print only the indices reported as valid.
for idx in range(task3.config_space.range_length):
    if not task3.config_space.is_index_valid(idx):
        continue
    print(task3.config_space.get(idx))


ConfigSpace (len=26, range_length=36, space_map=
   0 tile_y: Split(policy=factors, product=256, num_outputs=2) len=9
   1 tile_x: Split(policy=factors, product=256, num_outputs=2) len=4
)
[('tile_y', [-1, 1]), ('tile_x', [-1, 32])],None,0
[('tile_y', [-1, 2]), ('tile_x', [-1, 32])],None,1
[('tile_y', [-1, 4]), ('tile_x', [-1, 32])],None,2
[('tile_y', [-1, 8]), ('tile_x', [-1, 32])],None,3
[('tile_y', [-1, 16]), ('tile_x', [-1, 32])],None,4
[('tile_y', [-1, 1]), ('tile_x', [-1, 64])],None,9
[('tile_y', [-1, 2]), ('tile_x', [-1, 64])],None,10
[('tile_y', [-1, 4]), ('tile_x', [-1, 64])],None,11
[('tile_y', [-1, 8]), ('tile_x', [-1, 64])],None,12
[('tile_y', [-1, 16]), ('tile_x', [-1, 64])],None,13
[('tile_y', [-1, 32]), ('tile_x', [-1, 64])],None,14
[('tile_y', [-1, 1]), ('tile_x', [-1, 128])],None,18
[('tile_y', [-1, 2]), ('tile_x', [-1, 128])],None,19
[('tile_y', [-1, 4]), ('tile_x', [-1, 128])],None,20
[('tile_y', [-1, 8]), ('tile_x', [-1, 128])],None,21
[('tile_y', [-1, 16]), ('tile_

In [11]:
# Tune the unfiltered template; records are written to log_file_1.log.
tasks = [task1]
run_tuning(
    tasks,
    **get_tuning_option("log_file_1.log"),
)

[Task  1/ 1]  Current/Best:    3.81/  29.74 GFLOPS | Progress: (81/81) | 73.57 s Done.


In [12]:
# Dispatch context created from the tuning log of task1.
dispatch_context = autotvm.apply_history_best("log_file_1.log")
best_config = dispatch_context.query(task1.target, task1.workload)

print("\nBest config:", best_config, sep="\n")

# Instantiate the template under the dispatch context and target, then build
# it and print the lowered form.
with dispatch_context, tvm.target.Target(target):
    s, (A, B, C) = matmul_template_wo_filter(M, N, K)
    func = tvm.build(s, [A, B, C], target=target, name="matmul")
    lowered_func = tvm.lower(s, [A, B, C])
    print(lowered_func)


Best config:
[('tile_y', [-1, 1]), ('tile_x', [-1, 64])],None,54
# from tvm.script import ir as I
# from tvm.script import tir as T

@I.ir_module
class Module:
    @T.prim_func
    def main(A: T.Buffer((256, 256), "float32"), B: T.Buffer((256, 256), "float32"), C: T.Buffer((256, 256), "float32")):
        T.func_attr({"from_legacy_te_schedule": T.bool(True), "global_symbol": "main", "tir.noalias": T.bool(True)})
        for m_outer, n_outer in T.grid(256, 4):
            C_1 = T.Buffer((65536,), data=C.data)
            for n_inner_init in range(64):
                C_1[m_outer * 256 + n_outer * 64 + n_inner_init] = T.float32(0)
            for k, n_inner in T.grid(256, 64):
                cse_var_3: T.int32 = m_outer * 256
                cse_var_2: T.int32 = n_outer * 64
                cse_var_1: T.int32 = cse_var_3 + cse_var_2 + n_inner
                A_1 = T.Buffer((65536,), data=A.data)
                B_1 = T.Buffer((65536,), data=B.data)
                C_1[cse_var_1] = C_1[

In [13]:
# Tune the per-knob filtered template; records are written to log_file_2.log.
tasks = [task2]
run_tuning(
    tasks,
    **get_tuning_option("log_file_2.log"),
)

[Task  1/ 1]  Current/Best:   18.12/  29.81 GFLOPS | Progress: (36/36) | 30.59 s Done.


In [14]:
# Dispatch context created from the tuning log of task2.
dispatch_context = autotvm.apply_history_best("log_file_2.log")
best_config = dispatch_context.query(task2.target, task2.workload)

print("\nBest config:", best_config, sep="\n")

# Instantiate the template under the dispatch context and target, then build
# it and print the lowered form.
with dispatch_context, tvm.target.Target(target):
    s, (A, B, C) = matmul_template_with_filter(M, N, K)
    func = tvm.build(s, [A, B, C], target=target, name="matmul")
    lowered_func = tvm.lower(s, [A, B, C])
    print(lowered_func)


Best config:
[('tile_y', [-1, 1]), ('tile_x', [-1, 64])],None,9
# from tvm.script import ir as I
# from tvm.script import tir as T

@I.ir_module
class Module:
    @T.prim_func
    def main(A: T.Buffer((256, 256), "float32"), B: T.Buffer((256, 256), "float32"), C: T.Buffer((256, 256), "float32")):
        T.func_attr({"from_legacy_te_schedule": T.bool(True), "global_symbol": "main", "tir.noalias": T.bool(True)})
        for m_outer, n_outer in T.grid(256, 4):
            C_1 = T.Buffer((65536,), data=C.data)
            for n_inner_init in range(64):
                C_1[m_outer * 256 + n_outer * 64 + n_inner_init] = T.float32(0)
            for k, n_inner in T.grid(256, 64):
                cse_var_3: T.int32 = m_outer * 256
                cse_var_2: T.int32 = n_outer * 64
                cse_var_1: T.int32 = cse_var_3 + cse_var_2 + n_inner
                A_1 = T.Buffer((65536,), data=A.data)
                B_1 = T.Buffer((65536,), data=B.data)
                C_1[cse_var_1] = C_1[c

In [15]:
# Tune the multi-filter template; records are written to log_file_3.log.
tasks = [task3]
run_tuning(
    tasks,
    **get_tuning_option("log_file_3.log"),
)

[Task  1/ 1]  Current/Best:   22.32/  29.72 GFLOPS | Progress: (26/26) | 28.49 s Done.


In [16]:
# Dispatch context created from the tuning log of task3.
dispatch_context = autotvm.apply_history_best("log_file_3.log")
best_config = dispatch_context.query(task3.target, task3.workload)

print("\nBest config:", best_config, sep="\n")

# Instantiate the template under the dispatch context and target, then build
# it and print the lowered form.
with dispatch_context, tvm.target.Target(target):
    s, (A, B, C) = matmul_template_with_multi_filter(M, N, K)
    func = tvm.build(s, [A, B, C], target=target, name="matmul")
    lowered_func = tvm.lower(s, [A, B, C])
    print(lowered_func)


Best config:
[('tile_y', [-1, 1]), ('tile_x', [-1, 32])],None,0
# from tvm.script import ir as I
# from tvm.script import tir as T

@I.ir_module
class Module:
    @T.prim_func
    def main(A: T.Buffer((256, 256), "float32"), B: T.Buffer((256, 256), "float32"), C: T.Buffer((256, 256), "float32")):
        T.func_attr({"from_legacy_te_schedule": T.bool(True), "global_symbol": "main", "tir.noalias": T.bool(True)})
        for m_outer, n_outer in T.grid(256, 8):
            C_1 = T.Buffer((65536,), data=C.data)
            for n_inner_init in range(32):
                C_1[m_outer * 256 + n_outer * 32 + n_inner_init] = T.float32(0)
            for k, n_inner in T.grid(256, 32):
                cse_var_3: T.int32 = m_outer * 256
                cse_var_2: T.int32 = n_outer * 32
                cse_var_1: T.int32 = cse_var_3 + cse_var_2 + n_inner
                A_1 = T.Buffer((65536,), data=A.data)
                B_1 = T.Buffer((65536,), data=B.data)
                C_1[cse_var_1] = C_1[c

In [17]:
# Random float32 operands, drawn in the order a, then b.
a = np.random.uniform(size=(M, K)).astype("float32")
b = np.random.uniform(size=(K, N)).astype("float32")
# NumPy reference product, taken while a and b are still NumPy arrays.
c_np = a @ b

# Device handle and a zero-filled output buffer placed on it.
dev = tvm.cpu(0)
c_tvm = tvm.nd.array(np.zeros((M, N), "float32"), dev)
# Rebind a and b to their TVM array counterparts.
a, b = tvm.nd.array(a), tvm.nd.array(b)

In [18]:
# Time the most recently built `func` (10 runs per measurement).
time_f = func.time_evaluator(func.entry_name, dev, number=10)
cost = time_f(a, b, c_tvm).mean
# c_tvm was passed as the output argument of the timed runs, so compare it
# with the NumPy reference; without this check c_np is never used and the
# timed kernel is never validated. The tolerance allows for float32
# accumulation-order differences.
np.testing.assert_allclose(c_tvm.numpy(), c_np, rtol=1e-4)
# NOTE(review): the factor 1000 presumably converts seconds to milliseconds —
# confirm against the time_evaluator documentation.
print("func", cost*1000)

func 1.6896211


In [19]:
# Print the source text reported by the most recently built module `func`.
print(func.get_source())


; ModuleID = 'TVMMod'
source_filename = "TVMMod"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

%0 = type { i8*, %1, i32, %2, i64*, i64*, i64 }
%1 = type { i32, i32 }
%2 = type { i8, i8, i16 }

@__TVMAPISetLastError = linkonce dllexport local_unnamed_addr global void (i8*)* null, align 8
@.str = private constant [57 x i8] c"Assert fail: num_args == 3, matmul: num_args should be 3\00", align 1
@.str.1 = private constant [108 x i8] c"Assert fail: A_code == 3 or A_code == 13 or A_code == 7 or A_code == 4, matmul: Expect arg[0] to be pointer\00", align 1
@.str.2 = private constant [108 x i8] c"Assert fail: B_code == 3 or B_code == 13 or B_code == 7 or B_code == 4, matmul: Expect arg[1] to be pointer\00", align 1
@.str.3 = private constant [108 x i8] c"Assert fail: C_code == 3 or C_code == 13 or C_code == 7 or C_code == 4, matmul: Expect arg[2] to be pointer\00", align 1
@.str.4 = private constant [91 