In [1]:
!cat /proc/cpuinfo | grep flags

flags		: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid tsc_known_freq pni pclmulqdq ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm cmp_legacy cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw topoext ssbd ibrs ibpb stibp vmmcall fsgsbase tsc_adjust bmi1 avx2 smep bmi2 rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 clzero xsaveerptr arat npt nrip_save umip rdpid
flags		: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid tsc_known_freq pni pclmulqdq ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm cmp_legacy cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw topoext ssbd ibr

In [1]:
!pip install apache-tvm



In [2]:
!pip install psutil cloudpickle



In [1]:
!pip install xgboost==1.5.0



In [1]:
import tvm
from tvm import te
from tvm import autotvm
import numpy as np
from tvm.autotvm.tuner import XGBTuner, GATuner, RandomTuner, GridSearchTuner

In [2]:
import logging
import sys
# logging config (for printing tuning log to the screen)
logging.getLogger("autotvm").setLevel(logging.DEBUG)
logging.getLogger("autotvm").addHandler(logging.StreamHandler(sys.stdout))

In [3]:
def run_tuning(tasks, measure_option, tuner="gridsearch", early_stopping=None,
               log_filename="tuning.log", n_trial=None):
    for i, task in enumerate(tasks):
        prefix = "[Task %2d/%2d] " % (i + 1, len(tasks))

        # create tuner
        if tuner == "xgb" or tuner == "xgb-rank":
            tuner_obj = XGBTuner(task, loss_type="rank")
        elif tuner == "ga":
            tuner_obj = GATuner(task, pop_size=50)
        elif tuner == "random":
            tuner_obj = RandomTuner(task)
        elif tuner == "gridsearch":
            tuner_obj = GridSearchTuner(task)
        else:
            raise ValueError("Invalid tuner: " + tuner)

        # do tuning
        n_trial = min(n_trial or len(task.config_space), len(task.config_space))
        tuner_obj.tune(
            n_trial=n_trial or len(task.config_space),
            early_stopping=early_stopping,
            measure_option=measure_option,
            callbacks=[
                autotvm.callback.progress_bar(n_trial, prefix=prefix),
                autotvm.callback.log_to_file(log_filename),
            ],
        )
def evaluate_best_from_history(logfile, N, L, M, target):
    # apply history best from log file
    with autotvm.apply_history_best(logfile):
        with tvm.target.Target(target):
            s, arg_bufs = matmul_w_filters(N, L, M, "float32")
            func = tvm.build(s, arg_bufs)
    dev = tvm.cpu(0)

    # check correctness
    a_np = np.random.uniform(size=(N, L)).astype(np.float32)
    b_np = np.random.uniform(size=(L, M)).astype(np.float32)
    c_np = np.matmul(a_np, b_np).astype(np.float32)

    c_tvm = tvm.nd.array(np.zeros(c_np.shape, dtype=np.float32), dev)
    func(tvm.nd.array(a_np), tvm.nd.array(b_np), c_tvm)

    np.testing.assert_allclose(c_tvm.numpy(), c_np, rtol=1e-4)


    time_f = func.time_evaluator(func.entry_name, dev, number=10)
    cost = time_f(tvm.nd.array(a_np), tvm.nd.array(b_np), c_tvm).mean
    print("func", cost*1000)

In [4]:
N,L,M = 512, 512, 512

In [5]:
@autotvm.template("matmul_w_filters")
def matmul_w_filters(N, L, M, candidates=None):
    A = te.placeholder((N, L), name="A", dtype="float32")
    B = te.placeholder((L, M), name="B", dtype="float32")

    k = te.reduce_axis((0, L), name="k")
    C = te.compute((N, M), lambda i, j: te.sum(A[i, k] * B[k, j], axis=k), name="C")
    s = te.create_schedule(C.op)

    # schedule
    y, x = s[C].op.axis
    k = s[C].op.reduce_axis[0]

    ##### define space begin #####
    cfg = autotvm.get_config()
    filter = lambda v: v.size[0] != 2 and v.size[1] != 2
    multi_filter = lambda e: 32 <= (e["tile_x"].size[1] + e["tile_y"].size[1]) < 128
    cfg.multi_filter(multi_filter)
    candidates = [[4, 128], [8, 64], [16, 32], [32, 16], [64, 8], [128, 4], [2, 256], [256, 2]]

    cfg.define_split("tile_y", y, num_outputs=2, policy="candidate", candidate=candidates, filter=filter)
    cfg.define_split("tile_x", x, num_outputs=2, policy="candidate", candidate=candidates, filter=filter)
    ##### define space end #####

    # schedule according to config
    yo, yi = cfg["tile_y"].apply(s, C, y)
    xo, xi = cfg["tile_x"].apply(s, C, x)

    s[C].reorder(yo, xo, k, yi, xi)

    return s, [A, B, C]

In [6]:
t3 = autotvm.task.create("matmul_w_filters", args=(N, L, M), target="llvm")
for idx in range(t3.config_space.range_length):
    if t3.config_space.is_index_valid(idx):
        print(t3.config_space.get(idx))
to3 = {
    "log_filename": "matmul_w_filters.log",
    "n_trial": None,
    "tuner": "xgb",
    "early_stopping": None,
    "measure_option": autotvm.measure_option(
        builder=autotvm.LocalBuilder(),
        runner=autotvm.LocalRunner(
            number=1, repeat=10, min_repeat_ms=0, enable_cpu_cache_flush=True
        ),
    ),
}

[('tile_y', [16, 32]), ('tile_x', [8, 64])],None,8
[('tile_y', [32, 16]), ('tile_x', [8, 64])],None,9
[('tile_y', [64, 8]), ('tile_x', [8, 64])],None,10
[('tile_y', [128, 4]), ('tile_x', [8, 64])],None,11
[('tile_y', [8, 64]), ('tile_x', [16, 32])],None,13
[('tile_y', [16, 32]), ('tile_x', [16, 32])],None,14
[('tile_y', [32, 16]), ('tile_x', [16, 32])],None,15
[('tile_y', [64, 8]), ('tile_x', [16, 32])],None,16
[('tile_y', [128, 4]), ('tile_x', [16, 32])],None,17
[('tile_y', [8, 64]), ('tile_x', [32, 16])],None,19
[('tile_y', [16, 32]), ('tile_x', [32, 16])],None,20
[('tile_y', [32, 16]), ('tile_x', [32, 16])],None,21
[('tile_y', [8, 64]), ('tile_x', [64, 8])],None,25
[('tile_y', [16, 32]), ('tile_x', [64, 8])],None,26
[('tile_y', [8, 64]), ('tile_x', [128, 4])],None,31
[('tile_y', [16, 32]), ('tile_x', [128, 4])],None,32


In [7]:
print(t3.config_space)
run_tuning([t3], **to3)
evaluate_best_from_history("matmul_w_filters.log", N, L, M, "llvm")

ConfigSpace (len=16, range_length=36, space_map=
   0 tile_y: Split(policy=candidate, product=512, num_outputs=2) len=6
   1 tile_x: Split(policy=candidate, product=512, num_outputs=2) len=6
)
waiting for device...


DEBUG:autotvm:waiting for device...


device available


DEBUG:autotvm:device available


Get devices for measurement successfully!


INFO:autotvm:Get devices for measurement successfully!


No: 1	GFLOPS: 6.06/6.06	result: MeasureResult(costs=(0.043581309, 0.043786388, 0.043959477999999996, 0.044035619, 0.044193038999999996, 0.044230418, 0.044606859, 0.045693208), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.8413281440734863, timestamp=1713769078.9253442)	[('tile_y', [16, 32]), ('tile_x', [32, 16])],None,20


DEBUG:autotvm:No: 1	GFLOPS: 6.06/6.06	result: MeasureResult(costs=(0.043581309, 0.043786388, 0.043959477999999996, 0.044035619, 0.044193038999999996, 0.044230418, 0.044606859, 0.045693208), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.8413281440734863, timestamp=1713769078.9253442)	[('tile_y', [16, 32]), ('tile_x', [32, 16])],None,20


No: 2	GFLOPS: 12.76/12.76	result: MeasureResult(costs=(0.020788809000000002, 0.020789839, 0.020795209, 0.020916099, 0.020924729, 0.020972789, 0.021342719, 0.021724708), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.6692953109741211, timestamp=1713769079.3868809)	[('tile_y', [32, 16]), ('tile_x', [16, 32])],None,15


DEBUG:autotvm:No: 2	GFLOPS: 12.76/12.76	result: MeasureResult(costs=(0.020788809000000002, 0.020789839, 0.020795209, 0.020916099, 0.020924729, 0.020972789, 0.021342719, 0.021724708), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.6692953109741211, timestamp=1713769079.3868809)	[('tile_y', [32, 16]), ('tile_x', [16, 32])],None,15


No: 3	GFLOPS: 14.92/14.92	result: MeasureResult(costs=(0.017549229, 0.017580500000000002, 0.017699219, 0.01775915, 0.017937309, 0.018016319, 0.018065968999999998, 0.019368669), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.4879188537597656, timestamp=1713769079.9901636)	[('tile_y', [64, 8]), ('tile_x', [8, 64])],None,10


DEBUG:autotvm:No: 3	GFLOPS: 14.92/14.92	result: MeasureResult(costs=(0.017549229, 0.017580500000000002, 0.017699219, 0.01775915, 0.017937309, 0.018016319, 0.018065968999999998, 0.019368669), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.4879188537597656, timestamp=1713769079.9901636)	[('tile_y', [64, 8]), ('tile_x', [8, 64])],None,10


No: 4	GFLOPS: 5.24/14.92	result: MeasureResult(costs=(0.050640048, 0.050999378, 0.051015768, 0.051047178000000006, 0.051110719000000006, 0.051324558000000006, 0.051646988, 0.051730327), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.8103337287902832, timestamp=1713769080.7861545)	[('tile_y', [16, 32]), ('tile_x', [64, 8])],None,26


DEBUG:autotvm:No: 4	GFLOPS: 5.24/14.92	result: MeasureResult(costs=(0.050640048, 0.050999378, 0.051015768, 0.051047178000000006, 0.051110719000000006, 0.051324558000000006, 0.051646988, 0.051730327), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.8103337287902832, timestamp=1713769080.7861545)	[('tile_y', [16, 32]), ('tile_x', [64, 8])],None,26


No: 5	GFLOPS: 1.54/14.92	result: MeasureResult(costs=(0.167285834, 0.16919853499999998, 0.171590634, 0.176906553, 0.177236214, 0.177261513, 0.178341984, 0.17932224400000002), error_no=MeasureErrorNo.NO_ERROR, all_cost=2.156050443649292, timestamp=1713769083.0800533)	[('tile_y', [8, 64]), ('tile_x', [128, 4])],None,31


DEBUG:autotvm:No: 5	GFLOPS: 1.54/14.92	result: MeasureResult(costs=(0.167285834, 0.16919853499999998, 0.171590634, 0.176906553, 0.177236214, 0.177261513, 0.178341984, 0.17932224400000002), error_no=MeasureErrorNo.NO_ERROR, all_cost=2.156050443649292, timestamp=1713769083.0800533)	[('tile_y', [8, 64]), ('tile_x', [128, 4])],None,31


No: 6	GFLOPS: 6.17/14.92	result: MeasureResult(costs=(0.043219309000000004, 0.043243879, 0.043308609, 0.043377368, 0.043387008000000005, 0.043461199, 0.043626829, 0.044400247999999996), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.7377691268920898, timestamp=1713769083.7883346)	[('tile_y', [32, 16]), ('tile_x', [32, 16])],None,21


DEBUG:autotvm:No: 6	GFLOPS: 6.17/14.92	result: MeasureResult(costs=(0.043219309000000004, 0.043243879, 0.043308609, 0.043377368, 0.043387008000000005, 0.043461199, 0.043626829, 0.044400247999999996), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.7377691268920898, timestamp=1713769083.7883346)	[('tile_y', [32, 16]), ('tile_x', [32, 16])],None,21


No: 7	GFLOPS: 21.65/21.65	result: MeasureResult(costs=(0.011964919999999999, 0.012009368999999999, 0.012078439, 0.01214116, 0.01236607, 0.012495519, 0.012793480000000001, 0.013353170000000001), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.45249295234680176, timestamp=1713769084.3598602)	[('tile_y', [16, 32]), ('tile_x', [8, 64])],None,8


DEBUG:autotvm:No: 7	GFLOPS: 21.65/21.65	result: MeasureResult(costs=(0.011964919999999999, 0.012009368999999999, 0.012078439, 0.01214116, 0.01236607, 0.012495519, 0.012793480000000001, 0.013353170000000001), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.45249295234680176, timestamp=1713769084.3598602)	[('tile_y', [16, 32]), ('tile_x', [8, 64])],None,8


No: 8	GFLOPS: 14.34/21.65	result: MeasureResult(costs=(0.018516888999999998, 0.018518249, 0.018569219, 0.018712748999999997, 0.018772599, 0.018799709, 0.01890402, 0.018974649), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.4725377559661865, timestamp=1713769084.7996094)	[('tile_y', [128, 4]), ('tile_x', [8, 64])],None,11


DEBUG:autotvm:No: 8	GFLOPS: 14.34/21.65	result: MeasureResult(costs=(0.018516888999999998, 0.018518249, 0.018569219, 0.018712748999999997, 0.018772599, 0.018799709, 0.01890402, 0.018974649), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.4725377559661865, timestamp=1713769084.7996094)	[('tile_y', [128, 4]), ('tile_x', [8, 64])],None,11


No: 9	GFLOPS: 4.94/21.65	result: MeasureResult(costs=(0.053487489, 0.053526568, 0.053960388000000005, 0.054446898, 0.054728876999999995, 0.054812928, 0.054903518, 0.055027068), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.8576102256774902, timestamp=1713769085.780806)	[('tile_y', [8, 64]), ('tile_x', [64, 8])],None,25


DEBUG:autotvm:No: 9	GFLOPS: 4.94/21.65	result: MeasureResult(costs=(0.053487489, 0.053526568, 0.053960388000000005, 0.054446898, 0.054728876999999995, 0.054812928, 0.054903518, 0.055027068), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.8576102256774902, timestamp=1713769085.780806)	[('tile_y', [8, 64]), ('tile_x', [64, 8])],None,25


No: 10	GFLOPS: 18.06/21.65	result: MeasureResult(costs=(0.014425058999999999, 0.014476570000000001, 0.01452146, 0.014656269, 0.014669579, 0.014758859, 0.01507099, 0.01629854), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.4300246238708496, timestamp=1713769086.181282)	[('tile_y', [16, 32]), ('tile_x', [16, 32])],None,14


DEBUG:autotvm:No: 10	GFLOPS: 18.06/21.65	result: MeasureResult(costs=(0.014425058999999999, 0.014476570000000001, 0.01452146, 0.014656269, 0.014669579, 0.014758859, 0.01507099, 0.01629854), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.4300246238708496, timestamp=1713769086.181282)	[('tile_y', [16, 32]), ('tile_x', [16, 32])],None,14


No: 11	GFLOPS: 14.64/21.65	result: MeasureResult(costs=(0.017948509, 0.018045179, 0.018079579, 0.0181058, 0.018120379, 0.018343989, 0.01867965, 0.01936802), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.44046521186828613, timestamp=1713769086.7411864)	[('tile_y', [128, 4]), ('tile_x', [16, 32])],None,17


DEBUG:autotvm:No: 11	GFLOPS: 14.64/21.65	result: MeasureResult(costs=(0.017948509, 0.018045179, 0.018079579, 0.0181058, 0.018120379, 0.018343989, 0.01867965, 0.01936802), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.44046521186828613, timestamp=1713769086.7411864)	[('tile_y', [128, 4]), ('tile_x', [16, 32])],None,17


No: 12	GFLOPS: 1.67/21.65	result: MeasureResult(costs=(0.106459466, 0.11715215600000001, 0.16326343399999999, 0.173407874, 0.174239184, 0.182825643, 0.183710533, 0.18699216400000002), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.974527359008789, timestamp=1713769088.7125788)	[('tile_y', [16, 32]), ('tile_x', [128, 4])],None,32


DEBUG:autotvm:No: 12	GFLOPS: 1.67/21.65	result: MeasureResult(costs=(0.106459466, 0.11715215600000001, 0.16326343399999999, 0.173407874, 0.174239184, 0.182825643, 0.183710533, 0.18699216400000002), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.974527359008789, timestamp=1713769088.7125788)	[('tile_y', [16, 32]), ('tile_x', [128, 4])],None,32


No: 13	GFLOPS: 10.08/21.65	result: MeasureResult(costs=(0.02238343, 0.026700799, 0.026947088999999997, 0.027142779000000002, 0.027150819, 0.027156998999999998, 0.027296078999999997, 0.028162509), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.7167398929595947, timestamp=1713769089.801556)	[('tile_y', [64, 8]), ('tile_x', [16, 32])],None,16


DEBUG:autotvm:No: 13	GFLOPS: 10.08/21.65	result: MeasureResult(costs=(0.02238343, 0.026700799, 0.026947088999999997, 0.027142779000000002, 0.027150819, 0.027156998999999998, 0.027296078999999997, 0.028162509), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.7167398929595947, timestamp=1713769089.801556)	[('tile_y', [64, 8]), ('tile_x', [16, 32])],None,16


No: 14	GFLOPS: 11.02/21.65	result: MeasureResult(costs=(0.023681819, 0.023698919, 0.023839349, 0.024105769, 0.024355619000000002, 0.024607619, 0.024652909, 0.025927149), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.9296784400939941, timestamp=1713769090.303244)	[('tile_y', [32, 16]), ('tile_x', [8, 64])],None,9


DEBUG:autotvm:No: 14	GFLOPS: 11.02/21.65	result: MeasureResult(costs=(0.023681819, 0.023698919, 0.023839349, 0.024105769, 0.024355619000000002, 0.024607619, 0.024652909, 0.025927149), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.9296784400939941, timestamp=1713769090.303244)	[('tile_y', [32, 16]), ('tile_x', [8, 64])],None,9


No: 15	GFLOPS: 5.66/21.65	result: MeasureResult(costs=(0.046273258, 0.046980228, 0.047013508, 0.047232998000000005, 0.047691339, 0.047709989, 0.047871898, 0.048502618), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.8208949565887451, timestamp=1713769091.2745175)	[('tile_y', [8, 64]), ('tile_x', [32, 16])],None,19


DEBUG:autotvm:No: 15	GFLOPS: 5.66/21.65	result: MeasureResult(costs=(0.046273258, 0.046980228, 0.047013508, 0.047232998000000005, 0.047691339, 0.047709989, 0.047871898, 0.048502618), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.8208949565887451, timestamp=1713769091.2745175)	[('tile_y', [8, 64]), ('tile_x', [32, 16])],None,19


No: 16	GFLOPS: 17.99/21.65	result: MeasureResult(costs=(0.014753978999999999, 0.014759719000000001, 0.01476933, 0.014806109, 0.014874829999999999, 0.015052139, 0.015163929, 0.015223879000000001), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.48999667167663574, timestamp=1713769091.668098)	[('tile_y', [8, 64]), ('tile_x', [16, 32])],None,13


DEBUG:autotvm:No: 16	GFLOPS: 17.99/21.65	result: MeasureResult(costs=(0.014753978999999999, 0.014759719000000001, 0.01476933, 0.014806109, 0.014874829999999999, 0.015052139, 0.015163929, 0.015223879000000001), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.48999667167663574, timestamp=1713769091.668098)	[('tile_y', [8, 64]), ('tile_x', [16, 32])],None,13


Finish loading 40 records


DEBUG:autotvm:Finish loading 40 records


Cannot find config for target=llvm -keys=cpu , workload=('matmul_w_filters', 512, 512, 512, 'float32'). A fallback configuration is used, which may bring great performance regression.




func 13.444104500000002


In [8]:
t4 = autotvm.task.create("matmul_w_filters", args=(N, L, M), target="llvm -mcpu=core-avx2")
for idx in range(t4.config_space.range_length):
    if t4.config_space.is_index_valid(idx):
        print(t4.config_space.get(idx))
to4 = {
    "log_filename": "matmul_avx2_w_filters.log",
    "n_trial": None,
    "tuner": "xgb",
    "early_stopping": None,
    "measure_option": autotvm.measure_option(
        builder=autotvm.LocalBuilder(),
        runner=autotvm.LocalRunner(
            number=1, repeat=10, min_repeat_ms=0, enable_cpu_cache_flush=True
        ),
    ),
}
print(t4.config_space)


[('tile_y', [16, 32]), ('tile_x', [8, 64])],None,8
[('tile_y', [32, 16]), ('tile_x', [8, 64])],None,9
[('tile_y', [64, 8]), ('tile_x', [8, 64])],None,10
[('tile_y', [128, 4]), ('tile_x', [8, 64])],None,11
[('tile_y', [8, 64]), ('tile_x', [16, 32])],None,13
[('tile_y', [16, 32]), ('tile_x', [16, 32])],None,14
[('tile_y', [32, 16]), ('tile_x', [16, 32])],None,15
[('tile_y', [64, 8]), ('tile_x', [16, 32])],None,16
[('tile_y', [128, 4]), ('tile_x', [16, 32])],None,17
[('tile_y', [8, 64]), ('tile_x', [32, 16])],None,19
[('tile_y', [16, 32]), ('tile_x', [32, 16])],None,20
[('tile_y', [32, 16]), ('tile_x', [32, 16])],None,21
[('tile_y', [8, 64]), ('tile_x', [64, 8])],None,25
[('tile_y', [16, 32]), ('tile_x', [64, 8])],None,26
[('tile_y', [8, 64]), ('tile_x', [128, 4])],None,31
[('tile_y', [16, 32]), ('tile_x', [128, 4])],None,32
ConfigSpace (len=16, range_length=36, space_map=
   0 tile_y: Split(policy=candidate, product=512, num_outputs=2) len=6
   1 tile_x: Split(policy=candidate, product=5

In [9]:
run_tuning([t4], **to4)
evaluate_best_from_history("matmul_avx2_w_filters.log", N, L, M, "llvm -mcpu=core-avx2")

waiting for device...


DEBUG:autotvm:waiting for device...


device available


DEBUG:autotvm:device available


Get devices for measurement successfully!


INFO:autotvm:Get devices for measurement successfully!


No: 1	GFLOPS: 22.67/22.67	result: MeasureResult(costs=(0.01132611, 0.011339799, 0.011376058999999999, 0.011427789, 0.011737829, 0.011759439, 0.012555268999999999, 0.01320079), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.3958144187927246, timestamp=1713769128.7886553)	[('tile_y', [64, 8]), ('tile_x', [16, 32])],None,16


DEBUG:autotvm:No: 1	GFLOPS: 22.67/22.67	result: MeasureResult(costs=(0.01132611, 0.011339799, 0.011376058999999999, 0.011427789, 0.011737829, 0.011759439, 0.012555268999999999, 0.01320079), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.3958144187927246, timestamp=1713769128.7886553)	[('tile_y', [64, 8]), ('tile_x', [16, 32])],None,16


No: 2	GFLOPS: 4.00/22.67	result: MeasureResult(costs=(0.066933558, 0.066965288, 0.06711212700000001, 0.067116268, 0.067215248, 0.067250978, 0.067325397, 0.067342057), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.002739667892456, timestamp=1713769129.7591026)	[('tile_y', [32, 16]), ('tile_x', [32, 16])],None,21


DEBUG:autotvm:No: 2	GFLOPS: 4.00/22.67	result: MeasureResult(costs=(0.066933558, 0.066965288, 0.06711212700000001, 0.067116268, 0.067215248, 0.067250978, 0.067325397, 0.067342057), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.002739667892456, timestamp=1713769129.7591026)	[('tile_y', [32, 16]), ('tile_x', [32, 16])],None,21


No: 3	GFLOPS: 22.29/22.67	result: MeasureResult(costs=(0.009961339999999999, 0.00999129, 0.010366680000000001, 0.011960809, 0.01261337, 0.01366667, 0.013864959999999999, 0.01393489), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.398181676864624, timestamp=1713769130.2753427)	[('tile_y', [64, 8]), ('tile_x', [8, 64])],None,10


DEBUG:autotvm:No: 3	GFLOPS: 22.29/22.67	result: MeasureResult(costs=(0.009961339999999999, 0.00999129, 0.010366680000000001, 0.011960809, 0.01261337, 0.01366667, 0.013864959999999999, 0.01393489), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.398181676864624, timestamp=1713769130.2753427)	[('tile_y', [64, 8]), ('tile_x', [8, 64])],None,10


No: 4	GFLOPS: 10.99/22.67	result: MeasureResult(costs=(0.021147048999999998, 0.022699228999999998, 0.024563129, 0.025183679, 0.025330679, 0.025373779, 0.025403149, 0.025616839), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.5058503150939941, timestamp=1713769130.769742)	[('tile_y', [16, 32]), ('tile_x', [16, 32])],None,14


DEBUG:autotvm:No: 4	GFLOPS: 10.99/22.67	result: MeasureResult(costs=(0.021147048999999998, 0.022699228999999998, 0.024563129, 0.025183679, 0.025330679, 0.025373779, 0.025403149, 0.025616839), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.5058503150939941, timestamp=1713769130.769742)	[('tile_y', [16, 32]), ('tile_x', [16, 32])],None,14


No: 5	GFLOPS: 3.94/22.67	result: MeasureResult(costs=(0.067705727, 0.06770836799999999, 0.067817228, 0.06789073799999999, 0.067944708, 0.068245447, 0.06853453799999999, 0.068961617), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.032322645187378, timestamp=1713769131.9359424)	[('tile_y', [8, 64]), ('tile_x', [32, 16])],None,19


DEBUG:autotvm:No: 5	GFLOPS: 3.94/22.67	result: MeasureResult(costs=(0.067705727, 0.06770836799999999, 0.067817228, 0.06789073799999999, 0.067944708, 0.068245447, 0.06853453799999999, 0.068961617), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.032322645187378, timestamp=1713769131.9359424)	[('tile_y', [8, 64]), ('tile_x', [32, 16])],None,19


No: 6	GFLOPS: 16.38/22.67	result: MeasureResult(costs=(0.016249139, 0.016274319, 0.016327279, 0.016364369, 0.016390659, 0.016425369, 0.016496389, 0.016537779), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.4733574390411377, timestamp=1713769132.347702)	[('tile_y', [32, 16]), ('tile_x', [16, 32])],None,15


DEBUG:autotvm:No: 6	GFLOPS: 16.38/22.67	result: MeasureResult(costs=(0.016249139, 0.016274319, 0.016327279, 0.016364369, 0.016390659, 0.016425369, 0.016496389, 0.016537779), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.4733574390411377, timestamp=1713769132.347702)	[('tile_y', [32, 16]), ('tile_x', [16, 32])],None,15


No: 7	GFLOPS: 15.16/22.67	result: MeasureResult(costs=(0.015431298999999999, 0.015961309, 0.016653849000000002, 0.017716099, 0.018274149, 0.018322219, 0.019138869, 0.020167809), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.5297520160675049, timestamp=1713769132.9990184)	[('tile_y', [32, 16]), ('tile_x', [8, 64])],None,9


DEBUG:autotvm:No: 7	GFLOPS: 15.16/22.67	result: MeasureResult(costs=(0.015431298999999999, 0.015961309, 0.016653849000000002, 0.017716099, 0.018274149, 0.018322219, 0.019138869, 0.020167809), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.5297520160675049, timestamp=1713769132.9990184)	[('tile_y', [32, 16]), ('tile_x', [8, 64])],None,9


No: 8	GFLOPS: 3.66/22.67	result: MeasureResult(costs=(0.07128986799999999, 0.07159409800000001, 0.072333907, 0.073003837, 0.07329897699999999, 0.074185738, 0.074881097, 0.07570308799999999), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.0993173122406006, timestamp=1713769134.0578432)	[('tile_y', [16, 32]), ('tile_x', [32, 16])],None,20


DEBUG:autotvm:No: 8	GFLOPS: 3.66/22.67	result: MeasureResult(costs=(0.07128986799999999, 0.07159409800000001, 0.072333907, 0.073003837, 0.07329897699999999, 0.074185738, 0.074881097, 0.07570308799999999), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.0993173122406006, timestamp=1713769134.0578432)	[('tile_y', [16, 32]), ('tile_x', [32, 16])],None,20


No: 9	GFLOPS: 1.49/22.67	result: MeasureResult(costs=(0.168438984, 0.177780764, 0.179144113, 0.18000017399999999, 0.18186296400000002, 0.184562473, 0.18530575400000002, 0.185498534), error_no=MeasureErrorNo.NO_ERROR, all_cost=2.208862781524658, timestamp=1713769136.3877873)	[('tile_y', [8, 64]), ('tile_x', [128, 4])],None,31


DEBUG:autotvm:No: 9	GFLOPS: 1.49/22.67	result: MeasureResult(costs=(0.168438984, 0.177780764, 0.179144113, 0.18000017399999999, 0.18186296400000002, 0.184562473, 0.18530575400000002, 0.185498534), error_no=MeasureErrorNo.NO_ERROR, all_cost=2.208862781524658, timestamp=1713769136.3877873)	[('tile_y', [8, 64]), ('tile_x', [128, 4])],None,31


No: 10	GFLOPS: 25.32/25.32	result: MeasureResult(costs=(0.010476599999999999, 0.01049334, 0.0105226, 0.010571519, 0.01057176, 0.01065644, 0.010692589, 0.01084299), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.3468184471130371, timestamp=1713769136.7377198)	[('tile_y', [128, 4]), ('tile_x', [16, 32])],None,17


DEBUG:autotvm:No: 10	GFLOPS: 25.32/25.32	result: MeasureResult(costs=(0.010476599999999999, 0.01049334, 0.0105226, 0.010571519, 0.01057176, 0.01065644, 0.010692589, 0.01084299), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.3468184471130371, timestamp=1713769136.7377198)	[('tile_y', [128, 4]), ('tile_x', [16, 32])],None,17


No: 11	GFLOPS: 3.65/25.32	result: MeasureResult(costs=(0.07239840700000001, 0.07273713700000001, 0.072774808, 0.07302959799999999, 0.073653447, 0.07416697700000001, 0.074848657, 0.07498252700000001), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.0713813304901123, timestamp=1713769137.9270024)	[('tile_y', [8, 64]), ('tile_x', [64, 8])],None,25


DEBUG:autotvm:No: 11	GFLOPS: 3.65/25.32	result: MeasureResult(costs=(0.07239840700000001, 0.07273713700000001, 0.072774808, 0.07302959799999999, 0.073653447, 0.07416697700000001, 0.074848657, 0.07498252700000001), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.0713813304901123, timestamp=1713769137.9270024)	[('tile_y', [8, 64]), ('tile_x', [64, 8])],None,25


No: 12	GFLOPS: 1.40/25.32	result: MeasureResult(costs=(0.18017209399999998, 0.185287434, 0.18842747399999998, 0.19232895300000002, 0.19496228300000001, 0.195663083, 0.195927243, 0.19592895300000002), error_no=MeasureErrorNo.NO_ERROR, all_cost=2.3315062522888184, timestamp=1713769140.2458656)	[('tile_y', [16, 32]), ('tile_x', [128, 4])],None,32


DEBUG:autotvm:No: 12	GFLOPS: 1.40/25.32	result: MeasureResult(costs=(0.18017209399999998, 0.185287434, 0.18842747399999998, 0.19232895300000002, 0.19496228300000001, 0.195663083, 0.195927243, 0.19592895300000002), error_no=MeasureErrorNo.NO_ERROR, all_cost=2.3315062522888184, timestamp=1713769140.2458656)	[('tile_y', [16, 32]), ('tile_x', [128, 4])],None,32


No: 13	GFLOPS: 3.74/25.32	result: MeasureResult(costs=(0.071259297, 0.071309438, 0.071467688, 0.071508398, 0.071808138, 0.07209517700000001, 0.07211611700000001, 0.072355447), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.0288457870483398, timestamp=1713769141.3955858)	[('tile_y', [16, 32]), ('tile_x', [64, 8])],None,26


DEBUG:autotvm:No: 13	GFLOPS: 3.74/25.32	result: MeasureResult(costs=(0.071259297, 0.071309438, 0.071467688, 0.071508398, 0.071808138, 0.07209517700000001, 0.07211611700000001, 0.072355447), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.0288457870483398, timestamp=1713769141.3955858)	[('tile_y', [16, 32]), ('tile_x', [64, 8])],None,26


No: 14	GFLOPS: 26.76/26.76	result: MeasureResult(costs=(0.00995924, 0.00996579, 0.00996761, 0.010003489999999999, 0.01004098, 0.010057499999999999, 0.010115039, 0.010129219), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.3440728187561035, timestamp=1713769141.7338915)	[('tile_y', [128, 4]), ('tile_x', [8, 64])],None,11


DEBUG:autotvm:No: 14	GFLOPS: 26.76/26.76	result: MeasureResult(costs=(0.00995924, 0.00996579, 0.00996761, 0.010003489999999999, 0.01004098, 0.010057499999999999, 0.010115039, 0.010129219), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.3440728187561035, timestamp=1713769141.7338915)	[('tile_y', [128, 4]), ('tile_x', [8, 64])],None,11


No: 15	GFLOPS: 10.86/26.76	result: MeasureResult(costs=(0.02428446, 0.024579639, 0.024706278999999998, 0.02472191, 0.024819169, 0.024856159000000003, 0.024874669000000002, 0.02498802), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.531752347946167, timestamp=1713769142.3865693)	[('tile_y', [8, 64]), ('tile_x', [16, 32])],None,13


DEBUG:autotvm:No: 15	GFLOPS: 10.86/26.76	result: MeasureResult(costs=(0.02428446, 0.024579639, 0.024706278999999998, 0.02472191, 0.024819169, 0.024856159000000003, 0.024874669000000002, 0.02498802), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.531752347946167, timestamp=1713769142.3865693)	[('tile_y', [8, 64]), ('tile_x', [16, 32])],None,13


No: 16	GFLOPS: 23.60/26.76	result: MeasureResult(costs=(0.011320449, 0.011346049, 0.011365079, 0.011367589, 0.011386599, 0.011386990000000001, 0.01140592, 0.011407419), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.38237714767456055, timestamp=1713769142.7419481)	[('tile_y', [16, 32]), ('tile_x', [8, 64])],None,8


DEBUG:autotvm:No: 16	GFLOPS: 23.60/26.76	result: MeasureResult(costs=(0.011320449, 0.011346049, 0.011365079, 0.011367589, 0.011386599, 0.011386990000000001, 0.01140592, 0.011407419), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.38237714767456055, timestamp=1713769142.7419481)	[('tile_y', [16, 32]), ('tile_x', [8, 64])],None,8


Finish loading 21 records


DEBUG:autotvm:Finish loading 21 records


Cannot find config for target=llvm -keys=cpu -mcpu=core-avx2, workload=('matmul_w_filters', 512, 512, 512, 'float32'). A fallback configuration is used, which may bring great performance regression.




func 9.6948807


In [19]:
@autotvm.template("matmul_brute_force_reduced_")
def matmul_brute_force(N, L, M, candidates=None):
    A = te.placeholder((N, L), name="A", dtype="float32")
    B = te.placeholder((L, M), name="B", dtype="float32")

    k = te.reduce_axis((0, L), name="k")
    C = te.compute((N, M), lambda i, j: te.sum(A[i, k] * B[k, j], axis=k), name="C")
    s = te.create_schedule(C.op)

    # schedule
    y, x = s[C].op.axis
    k = s[C].op.reduce_axis[0]

    ##### define space begin #####
    cfg = autotvm.get_config()
    filter = lambda v: v.size[0] != 1 and v.size[1] != 1
    multi_filter = lambda e: 16 <= (e["tile_x"].size[1] + e["tile_y"].size[1]) < 128
    cfg.multi_filter(multi_filter)

    cfg.define_split("tile_y", y, num_outputs=2, filter=filter)
    cfg.define_split("tile_x", x, num_outputs=2, filter=filter)
    ##### define space end #####

    # schedule according to config
    yo, yi = cfg["tile_y"].apply(s, C, y)
    xo, xi = cfg["tile_x"].apply(s, C, x)

    s[C].reorder(yo, xo, k, yi, xi)

    return s, [A, B, C]

In [20]:
t2 = autotvm.task.create("matmul_brute_force_reduced_", args=(N, L, M), target="llvm")
for idx in range(t2.config_space.range_length):
    if t2.config_space.is_index_valid(idx):
        print(t2.config_space.get(idx))
to2 = {
    "log_filename": "matmul_brute_force_reduced_.log",
    "n_trial": None,
    "tuner": "xgb",
    "early_stopping": None,
    "measure_option": autotvm.measure_option(
        builder=autotvm.LocalBuilder(),
        runner=autotvm.LocalRunner(
            number=1, repeat=10, min_repeat_ms=0, enable_cpu_cache_flush=True
        ),
    ),
}

[('tile_y', [-1, 16]), ('tile_x', [-1, 2])],None,3
[('tile_y', [-1, 32]), ('tile_x', [-1, 2])],None,4
[('tile_y', [-1, 64]), ('tile_x', [-1, 2])],None,5
[('tile_y', [-1, 16]), ('tile_x', [-1, 4])],None,12
[('tile_y', [-1, 32]), ('tile_x', [-1, 4])],None,13
[('tile_y', [-1, 64]), ('tile_x', [-1, 4])],None,14
[('tile_y', [-1, 8]), ('tile_x', [-1, 8])],None,20
[('tile_y', [-1, 16]), ('tile_x', [-1, 8])],None,21
[('tile_y', [-1, 32]), ('tile_x', [-1, 8])],None,22
[('tile_y', [-1, 64]), ('tile_x', [-1, 8])],None,23
[('tile_y', [-1, 2]), ('tile_x', [-1, 16])],None,27
[('tile_y', [-1, 4]), ('tile_x', [-1, 16])],None,28
[('tile_y', [-1, 8]), ('tile_x', [-1, 16])],None,29
[('tile_y', [-1, 16]), ('tile_x', [-1, 16])],None,30
[('tile_y', [-1, 32]), ('tile_x', [-1, 16])],None,31
[('tile_y', [-1, 64]), ('tile_x', [-1, 16])],None,32
[('tile_y', [-1, 2]), ('tile_x', [-1, 32])],None,36
[('tile_y', [-1, 4]), ('tile_x', [-1, 32])],None,37
[('tile_y', [-1, 8]), ('tile_x', [-1, 32])],None,38
[('tile_y', [

In [17]:
print(t2.config_space)

ConfigSpace (len=33, range_length=100, space_map=
   0 tile_y: Split(policy=factors, product=512, num_outputs=2) len=10
   1 tile_x: Split(policy=factors, product=512, num_outputs=2) len=10
)


In [21]:

run_tuning([t2], **to2)

waiting for device...


DEBUG:autotvm:waiting for device...


device available


DEBUG:autotvm:device available


Get devices for measurement successfully!


INFO:autotvm:Get devices for measurement successfully!


No: 1	GFLOPS: 14.68/14.68	result: MeasureResult(costs=(0.017961479, 0.018055150000000002, 0.018095949, 0.018101959999999997, 0.018103049, 0.018345, 0.018654339, 0.018942678999999997), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.4382205009460449, timestamp=1713769579.1245925)	[('tile_y', [-1, 4]), ('tile_x', [-1, 64])],None,46


DEBUG:autotvm:No: 1	GFLOPS: 14.68/14.68	result: MeasureResult(costs=(0.017961479, 0.018055150000000002, 0.018095949, 0.018101959999999997, 0.018103049, 0.018345, 0.018654339, 0.018942678999999997), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.4382205009460449, timestamp=1713769579.1245925)	[('tile_y', [-1, 4]), ('tile_x', [-1, 64])],None,46


No: 2	GFLOPS: 1.51/14.68	result: MeasureResult(costs=(0.167923945, 0.174843794, 0.178048694, 0.17933022299999998, 0.180033894, 0.18062589299999998, 0.181354983, 0.184085743), error_no=MeasureErrorNo.NO_ERROR, all_cost=2.2033944129943848, timestamp=1713769581.3103216)	[('tile_y', [-1, 64]), ('tile_x', [-1, 4])],None,14


DEBUG:autotvm:No: 2	GFLOPS: 1.51/14.68	result: MeasureResult(costs=(0.167923945, 0.174843794, 0.178048694, 0.17933022299999998, 0.180033894, 0.18062589299999998, 0.181354983, 0.184085743), error_no=MeasureErrorNo.NO_ERROR, all_cost=2.2033944129943848, timestamp=1713769581.3103216)	[('tile_y', [-1, 64]), ('tile_x', [-1, 4])],None,14


No: 3	GFLOPS: 14.31/14.68	result: MeasureResult(costs=(0.018583589, 0.018656770000000003, 0.018657389, 0.018666989000000002, 0.01870732, 0.018725679, 0.018901159, 0.019204329000000003), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.4477672576904297, timestamp=1713769581.9325662)	[('tile_y', [-1, 2]), ('tile_x', [-1, 64])],None,45


DEBUG:autotvm:No: 3	GFLOPS: 14.31/14.68	result: MeasureResult(costs=(0.018583589, 0.018656770000000003, 0.018657389, 0.018666989000000002, 0.01870732, 0.018725679, 0.018901159, 0.019204329000000003), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.4477672576904297, timestamp=1713769581.9325662)	[('tile_y', [-1, 2]), ('tile_x', [-1, 64])],None,45


No: 4	GFLOPS: 14.64/14.68	result: MeasureResult(costs=(0.018006889, 0.018077509, 0.01818825, 0.018201699000000002, 0.01825489, 0.018324969, 0.018722179000000002, 0.018944639), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.4967226982116699, timestamp=1713769582.3696012)	[('tile_y', [-1, 8]), ('tile_x', [-1, 32])],None,38


DEBUG:autotvm:No: 4	GFLOPS: 14.64/14.68	result: MeasureResult(costs=(0.018006889, 0.018077509, 0.01818825, 0.018201699000000002, 0.01825489, 0.018324969, 0.018722179000000002, 0.018944639), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.4967226982116699, timestamp=1713769582.3696012)	[('tile_y', [-1, 8]), ('tile_x', [-1, 32])],None,38


No: 5	GFLOPS: 12.51/14.68	result: MeasureResult(costs=(0.020959909, 0.021081579000000003, 0.021123919, 0.021186529, 0.021287669000000002, 0.021450609, 0.02193695, 0.022702688999999998), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.5964345932006836, timestamp=1713769583.1751173)	[('tile_y', [-1, 16]), ('tile_x', [-1, 32])],None,39


DEBUG:autotvm:No: 5	GFLOPS: 12.51/14.68	result: MeasureResult(costs=(0.020959909, 0.021081579000000003, 0.021123919, 0.021186529, 0.021287669000000002, 0.021450609, 0.02193695, 0.022702688999999998), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.5964345932006836, timestamp=1713769583.1751173)	[('tile_y', [-1, 16]), ('tile_x', [-1, 32])],None,39


No: 6	GFLOPS: 11.14/14.68	result: MeasureResult(costs=(0.024044319, 0.024044668999999998, 0.02405891, 0.024074949000000002, 0.024093309, 0.024115899, 0.024153439, 0.024171379000000003), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.7149598598480225, timestamp=1713769583.672841)	[('tile_y', [-1, 16]), ('tile_x', [-1, 64])],None,48


DEBUG:autotvm:No: 6	GFLOPS: 11.14/14.68	result: MeasureResult(costs=(0.024044319, 0.024044668999999998, 0.02405891, 0.024074949000000002, 0.024093309, 0.024115899, 0.024153439, 0.024171379000000003), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.7149598598480225, timestamp=1713769583.672841)	[('tile_y', [-1, 16]), ('tile_x', [-1, 64])],None,48


No: 7	GFLOPS: 5.99/14.68	result: MeasureResult(costs=(0.043890279, 0.043986788, 0.044562477999999996, 0.044571978, 0.044971758, 0.045046918, 0.045301658, 0.046426037999999996), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.7673447132110596, timestamp=1713769584.558283)	[('tile_y', [-1, 32]), ('tile_x', [-1, 16])],None,31


DEBUG:autotvm:No: 7	GFLOPS: 5.99/14.68	result: MeasureResult(costs=(0.043890279, 0.043986788, 0.044562477999999996, 0.044571978, 0.044971758, 0.045046918, 0.045301658, 0.046426037999999996), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.7673447132110596, timestamp=1713769584.558283)	[('tile_y', [-1, 32]), ('tile_x', [-1, 16])],None,31


No: 8	GFLOPS: 18.29/18.29	result: MeasureResult(costs=(0.01410917, 0.014165089, 0.014251070000000001, 0.014325539, 0.014996129, 0.01516487, 0.01518056, 0.01519466), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.42378854751586914, timestamp=1713769584.9500198)	[('tile_y', [-1, 32]), ('tile_x', [-1, 32])],None,40


DEBUG:autotvm:No: 8	GFLOPS: 18.29/18.29	result: MeasureResult(costs=(0.01410917, 0.014165089, 0.014251070000000001, 0.014325539, 0.014996129, 0.01516487, 0.01518056, 0.01519466), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.42378854751586914, timestamp=1713769584.9500198)	[('tile_y', [-1, 32]), ('tile_x', [-1, 32])],None,40


No: 9	GFLOPS: 18.37/18.37	result: MeasureResult(costs=(0.014375219, 0.0143932, 0.01443409, 0.01445811, 0.014612919, 0.014654028999999999, 0.014661720000000001, 0.01533257), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.4712357521057129, timestamp=1713769585.5385401)	[('tile_y', [-1, 64]), ('tile_x', [-1, 32])],None,41


DEBUG:autotvm:No: 9	GFLOPS: 18.37/18.37	result: MeasureResult(costs=(0.014375219, 0.0143932, 0.01443409, 0.01445811, 0.014612919, 0.014654028999999999, 0.014661720000000001, 0.01533257), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.4712357521057129, timestamp=1713769585.5385401)	[('tile_y', [-1, 64]), ('tile_x', [-1, 32])],None,41


No: 10	GFLOPS: 3.59/18.37	result: MeasureResult(costs=(0.07305823700000001, 0.073103397, 0.07320178699999999, 0.073831328, 0.075395927, 0.075639948, 0.076361407, 0.078268637), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.0746045112609863, timestamp=1713769586.594055)	[('tile_y', [-1, 4]), ('tile_x', [-1, 16])],None,28


DEBUG:autotvm:No: 10	GFLOPS: 3.59/18.37	result: MeasureResult(costs=(0.07305823700000001, 0.073103397, 0.07320178699999999, 0.073831328, 0.075395927, 0.075639948, 0.076361407, 0.078268637), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.0746045112609863, timestamp=1713769586.594055)	[('tile_y', [-1, 4]), ('tile_x', [-1, 16])],None,28


No: 11	GFLOPS: 4.98/18.37	result: MeasureResult(costs=(0.053314888000000005, 0.053522608, 0.053549209, 0.053653938, 0.053877758, 0.054030158, 0.054038857999999995, 0.055559618), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.9939346313476562, timestamp=1713769587.7693615)	[('tile_y', [-1, 64]), ('tile_x', [-1, 8])],None,23


DEBUG:autotvm:No: 11	GFLOPS: 4.98/18.37	result: MeasureResult(costs=(0.053314888000000005, 0.053522608, 0.053549209, 0.053653938, 0.053877758, 0.054030158, 0.054038857999999995, 0.055559618), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.9939346313476562, timestamp=1713769587.7693615)	[('tile_y', [-1, 64]), ('tile_x', [-1, 8])],None,23


No: 12	GFLOPS: 22.01/22.01	result: MeasureResult(costs=(0.011660069, 0.011842209999999999, 0.011887309, 0.012147838999999999, 0.01220982, 0.012457519, 0.012564019999999999, 0.012779699), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.549339771270752, timestamp=1713769588.1319046)	[('tile_y', [-1, 32]), ('tile_x', [-1, 64])],None,49


DEBUG:autotvm:No: 12	GFLOPS: 22.01/22.01	result: MeasureResult(costs=(0.011660069, 0.011842209999999999, 0.011887309, 0.012147838999999999, 0.01220982, 0.012457519, 0.012564019999999999, 0.012779699), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.549339771270752, timestamp=1713769588.1319046)	[('tile_y', [-1, 32]), ('tile_x', [-1, 64])],None,49


No: 13	GFLOPS: 5.35/22.01	result: MeasureResult(costs=(0.049240768000000004, 0.049274199000000005, 0.049572648000000004, 0.049905218, 0.049957089, 0.049997308, 0.050580149, 0.052643648), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.7824454307556152, timestamp=1713769589.0373187)	[('tile_y', [-1, 16]), ('tile_x', [-1, 8])],None,21


DEBUG:autotvm:No: 13	GFLOPS: 5.35/22.01	result: MeasureResult(costs=(0.049240768000000004, 0.049274199000000005, 0.049572648000000004, 0.049905218, 0.049957089, 0.049997308, 0.050580149, 0.052643648), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.7824454307556152, timestamp=1713769589.0373187)	[('tile_y', [-1, 16]), ('tile_x', [-1, 8])],None,21


No: 14	GFLOPS: 1.48/22.01	result: MeasureResult(costs=(0.170464414, 0.178147254, 0.180753374, 0.181050353, 0.181118243, 0.185464833, 0.185544613, 0.185680154), error_no=MeasureErrorNo.NO_ERROR, all_cost=2.219104290008545, timestamp=1713769591.2541928)	[('tile_y', [-1, 32]), ('tile_x', [-1, 4])],None,13


DEBUG:autotvm:No: 14	GFLOPS: 1.48/22.01	result: MeasureResult(costs=(0.170464414, 0.178147254, 0.180753374, 0.181050353, 0.181118243, 0.185464833, 0.185544613, 0.185680154), error_no=MeasureErrorNo.NO_ERROR, all_cost=2.219104290008545, timestamp=1713769591.2541928)	[('tile_y', [-1, 32]), ('tile_x', [-1, 4])],None,13


No: 15	GFLOPS: 1.69/22.01	result: MeasureResult(costs=(0.158271604, 0.15834435400000002, 0.158392685, 0.158409334, 0.158455154, 0.15871542500000002, 0.159101364, 0.159150514), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.9915528297424316, timestamp=1713769593.375755)	[('tile_y', [-1, 16]), ('tile_x', [-1, 2])],None,3


DEBUG:autotvm:No: 15	GFLOPS: 1.69/22.01	result: MeasureResult(costs=(0.158271604, 0.15834435400000002, 0.158392685, 0.158409334, 0.158455154, 0.15871542500000002, 0.159101364, 0.159150514), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.9915528297424316, timestamp=1713769593.375755)	[('tile_y', [-1, 16]), ('tile_x', [-1, 2])],None,3


No: 16	GFLOPS: 6.17/22.01	result: MeasureResult(costs=(0.043087618, 0.043137648, 0.043260648000000006, 0.043320228, 0.043547158, 0.043679319, 0.043931848, 0.044088479), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.7365243434906006, timestamp=1713769594.0876272)	[('tile_y', [-1, 16]), ('tile_x', [-1, 16])],None,30


DEBUG:autotvm:No: 16	GFLOPS: 6.17/22.01	result: MeasureResult(costs=(0.043087618, 0.043137648, 0.043260648000000006, 0.043320228, 0.043547158, 0.043679319, 0.043931848, 0.044088479), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.7365243434906006, timestamp=1713769594.0876272)	[('tile_y', [-1, 16]), ('tile_x', [-1, 16])],None,30


No: 17	GFLOPS: 0.69/22.01	result: MeasureResult(costs=(0.356290887, 0.356833058, 0.356903757, 0.35765572700000003, 0.358384207, 0.363731927, 0.450162754, 0.49592008299999996), error_no=MeasureErrorNo.NO_ERROR, all_cost=4.581121921539307, timestamp=1713769598.8071964)	[('tile_y', [-1, 32]), ('tile_x', [-1, 2])],None,4


DEBUG:autotvm:No: 17	GFLOPS: 0.69/22.01	result: MeasureResult(costs=(0.356290887, 0.356833058, 0.356903757, 0.35765572700000003, 0.358384207, 0.363731927, 0.450162754, 0.49592008299999996), error_no=MeasureErrorNo.NO_ERROR, all_cost=4.581121921539307, timestamp=1713769598.8071964)	[('tile_y', [-1, 32]), ('tile_x', [-1, 2])],None,4


No: 18	GFLOPS: 4.34/22.01	result: MeasureResult(costs=(0.060904868, 0.060948798, 0.061187008, 0.061588247000000006, 0.061744748, 0.062770258, 0.062784398, 0.063196907), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.9425573348999023, timestamp=1713769599.7317686)	[('tile_y', [-1, 2]), ('tile_x', [-1, 16])],None,27


DEBUG:autotvm:No: 18	GFLOPS: 4.34/22.01	result: MeasureResult(costs=(0.060904868, 0.060948798, 0.061187008, 0.061588247000000006, 0.061744748, 0.062770258, 0.062784398, 0.063196907), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.9425573348999023, timestamp=1713769599.7317686)	[('tile_y', [-1, 2]), ('tile_x', [-1, 16])],None,27


No: 19	GFLOPS: 5.80/22.01	result: MeasureResult(costs=(0.045752838, 0.045901718, 0.046017029, 0.046019738, 0.046265428, 0.046329169, 0.046380898999999996, 0.047783628), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.7387423515319824, timestamp=1713769600.5904732)	[('tile_y', [-1, 8]), ('tile_x', [-1, 8])],None,20


DEBUG:autotvm:No: 19	GFLOPS: 5.80/22.01	result: MeasureResult(costs=(0.045752838, 0.045901718, 0.046017029, 0.046019738, 0.046265428, 0.046329169, 0.046380898999999996, 0.047783628), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.7387423515319824, timestamp=1713769600.5904732)	[('tile_y', [-1, 8]), ('tile_x', [-1, 8])],None,20


No: 20	GFLOPS: 3.56/22.01	result: MeasureResult(costs=(0.074566047, 0.074574438, 0.07487437799999999, 0.075065288, 0.075382767, 0.075628057, 0.076287227, 0.077466577), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.0557057857513428, timestamp=1713769601.6525898)	[('tile_y', [-1, 16]), ('tile_x', [-1, 4])],None,12


DEBUG:autotvm:No: 20	GFLOPS: 3.56/22.01	result: MeasureResult(costs=(0.074566047, 0.074574438, 0.07487437799999999, 0.075065288, 0.075382767, 0.075628057, 0.076287227, 0.077466577), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.0557057857513428, timestamp=1713769601.6525898)	[('tile_y', [-1, 16]), ('tile_x', [-1, 4])],None,12


No: 21	GFLOPS: 5.62/22.01	result: MeasureResult(costs=(0.047089938, 0.047164528, 0.047417529, 0.047541868, 0.047638938000000006, 0.048282218, 0.048405138, 0.048616678999999996), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.831904411315918, timestamp=1713769602.6143882)	[('tile_y', [-1, 64]), ('tile_x', [-1, 16])],None,32


DEBUG:autotvm:No: 21	GFLOPS: 5.62/22.01	result: MeasureResult(costs=(0.047089938, 0.047164528, 0.047417529, 0.047541868, 0.047638938000000006, 0.048282218, 0.048405138, 0.048616678999999996), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.831904411315918, timestamp=1713769602.6143882)	[('tile_y', [-1, 64]), ('tile_x', [-1, 16])],None,32


No: 22	GFLOPS: 15.17/22.01	result: MeasureResult(costs=(0.0175527, 0.017579979000000003, 0.017683549, 0.01771011, 0.017711539, 0.017716489, 0.01772243, 0.01791553), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.5103371143341064, timestamp=1713769603.0455701)	[('tile_y', [-1, 8]), ('tile_x', [-1, 64])],None,47


DEBUG:autotvm:No: 22	GFLOPS: 15.17/22.01	result: MeasureResult(costs=(0.0175527, 0.017579979000000003, 0.017683549, 0.01771011, 0.017711539, 0.017716489, 0.01772243, 0.01791553), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.5103371143341064, timestamp=1713769603.0455701)	[('tile_y', [-1, 8]), ('tile_x', [-1, 64])],None,47


No: 23	GFLOPS: 14.66/22.01	result: MeasureResult(costs=(0.018050009, 0.018053459, 0.018073749, 0.018260709, 0.018273749, 0.018356359000000003, 0.0183933, 0.019062069), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.42687368392944336, timestamp=1713769603.6078403)	[('tile_y', [-1, 2]), ('tile_x', [-1, 32])],None,36


DEBUG:autotvm:No: 23	GFLOPS: 14.66/22.01	result: MeasureResult(costs=(0.018050009, 0.018053459, 0.018073749, 0.018260709, 0.018273749, 0.018356359000000003, 0.0183933, 0.019062069), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.42687368392944336, timestamp=1713769603.6078403)	[('tile_y', [-1, 2]), ('tile_x', [-1, 32])],None,36


No: 24	GFLOPS: 5.24/22.01	result: MeasureResult(costs=(0.050037239, 0.050058388, 0.050253948, 0.050284618, 0.050566818, 0.052433988, 0.053218018, 0.053336787000000004), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.9091489315032959, timestamp=1713769604.5054107)	[('tile_y', [-1, 32]), ('tile_x', [-1, 8])],None,22


DEBUG:autotvm:No: 24	GFLOPS: 5.24/22.01	result: MeasureResult(costs=(0.050037239, 0.050058388, 0.050253948, 0.050284618, 0.050566818, 0.052433988, 0.053218018, 0.053336787000000004), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.9091489315032959, timestamp=1713769604.5054107)	[('tile_y', [-1, 32]), ('tile_x', [-1, 8])],None,22


No: 25	GFLOPS: 14.81/22.01	result: MeasureResult(costs=(0.017960889, 0.01798026, 0.017995629000000003, 0.018074989, 0.018155018999999998, 0.018163669, 0.01819938, 0.018480989), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.4337656497955322, timestamp=1713769605.0771787)	[('tile_y', [-1, 4]), ('tile_x', [-1, 32])],None,37


DEBUG:autotvm:No: 25	GFLOPS: 14.81/22.01	result: MeasureResult(costs=(0.017960889, 0.01798026, 0.017995629000000003, 0.018074989, 0.018155018999999998, 0.018163669, 0.01819938, 0.018480989), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.4337656497955322, timestamp=1713769605.0771787)	[('tile_y', [-1, 4]), ('tile_x', [-1, 32])],None,37


No: 26	GFLOPS: 6.11/22.01	result: MeasureResult(costs=(0.043686748000000004, 0.043818538, 0.043882579, 0.043890509, 0.044000238000000004, 0.044026569, 0.044054948000000003, 0.044083849), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.7779927253723145, timestamp=1713769605.8342779)	[('tile_y', [-1, 8]), ('tile_x', [-1, 16])],None,29


DEBUG:autotvm:No: 26	GFLOPS: 6.11/22.01	result: MeasureResult(costs=(0.043686748000000004, 0.043818538, 0.043882579, 0.043890509, 0.044000238000000004, 0.044026569, 0.044054948000000003, 0.044083849), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.7779927253723145, timestamp=1713769605.8342779)	[('tile_y', [-1, 8]), ('tile_x', [-1, 16])],None,29


No: 27	GFLOPS: 0.63/22.01	result: MeasureResult(costs=(0.349120528, 0.356044087, 0.369796607, 0.43288926400000005, 0.458574844, 0.471594323, 0.47427137399999997, 0.475638714), error_no=MeasureErrorNo.NO_ERROR, all_cost=4.8864734172821045, timestamp=1713769610.8383458)	[('tile_y', [-1, 64]), ('tile_x', [-1, 2])],None,5


DEBUG:autotvm:No: 27	GFLOPS: 0.63/22.01	result: MeasureResult(costs=(0.349120528, 0.356044087, 0.369796607, 0.43288926400000005, 0.458574844, 0.471594323, 0.47427137399999997, 0.475638714), error_no=MeasureErrorNo.NO_ERROR, all_cost=4.8864734172821045, timestamp=1713769610.8383458)	[('tile_y', [-1, 64]), ('tile_x', [-1, 2])],None,5


In [22]:
# apply history best from log file
with autotvm.apply_history_best("matmul_brute_force_reduced_.log"):
    with tvm.target.Target("llvm"):
        s, arg_bufs = matmul_brute_force(N, L, M, "float32")
        func = tvm.build(s, arg_bufs)
dev = tvm.cpu(0)

# check correctness
a_np = np.random.uniform(size=(N, L)).astype(np.float32)
b_np = np.random.uniform(size=(L, M)).astype(np.float32)
c_np = np.matmul(a_np, b_np).astype(np.float32)

c_tvm = tvm.nd.array(np.zeros(c_np.shape, dtype=np.float32), dev)
func(tvm.nd.array(a_np), tvm.nd.array(b_np), c_tvm)

np.testing.assert_allclose(c_tvm.numpy(), c_np, rtol=1e-4)


time_f = func.time_evaluator(func.entry_name, dev, number=10)
cost = time_f(tvm.nd.array(a_np), tvm.nd.array(b_np), c_tvm).mean
print("func", cost*1000)

Finish loading 27 records


DEBUG:autotvm:Finish loading 27 records


Cannot find config for target=llvm -keys=cpu , workload=('matmul_brute_force_reduced_', 512, 512, 512, 'float32'). A fallback configuration is used, which may bring great performance regression.




func 159.2494124
