In [10]:
import TensorFrost as tf
import numpy as np
import matplotlib.pyplot as plt
import time

# Initialize TensorFrost with the CPU backend before any program is compiled.
tf.initialize(tf.cpu)

def matmul():
    """TensorFrost program computing ``(sin(A) @ cos(B).T) ** 2.0``.

    Inputs (declared with ``tf.input``, both ``tf.float32``):
        A: 2-D tensor declared with shape ``[-1, -1]``
           (presumably -1 marks a dimension resolved at call time -- confirm
           against the TensorFrost docs).
        B: 2-D tensor declared with shape ``[-1, M]`` where ``M = A.shape[1]``,
           so B's second dimension is tied to A's second dimension and
           ``cos(B).T`` is conformable with ``sin(A)``.

    Returns:
        A one-element list ``[C]`` holding the squared product.
    """
    A = tf.input([-1, -1], tf.float32)
    # Only the shared inner dimension is needed; A's first dimension is unused.
    _, M = A.shape
    B = tf.input([-1, M], tf.float32)

    C = (tf.sin(A) @ tf.cos(B).T)**2.0

    return [C]

# Compile the matmul program into a callable; it is invoked later in the
# notebook as mmul(A, B) and its result is unpacked as a single output.
mmul = tf.compile(matmul)

matmul:
  Kernel count: 1
  Intermediate buffers: 0
  Host readbacks: 0
  Host writes: 0
  Lines of generated code: 464
  IR Compile time: 0.483100 ms
  Compiler time: 1782.470947 ms



In [11]:
# Dump every kernel TensorFrost has generated so far, one entry per line,
# under a heading line.
all_kernels = tf.get_all_generated_kernels()
print("Generated kernels:", *all_kernels, sep="\n")

Generated kernels:

extern "C" __declspec(dllexport) void kernel_0(uint* var, uint* off, uint* mem, uint work_group_count)
{
  #pragma omp parallel for
  for (int block_id = 0; block_id < work_group_count; block_id++)
  {
    for (int block_thread_id1 = 0; block_thread_id1 < 16; block_thread_id1++)
    for (int block_thread_id0 = 0; block_thread_id0 < 16; block_thread_id0++)
    {
      int v2_0 = block_id;
      int in_block_index_0 = block_thread_id1;
      int in_block_index_1 = block_thread_id0;
      int v2_1 = asint(var[0]) + 16;
      int v2_3 = v2_1 - 1;
      int blocks_shape_1 = v2_3 / 16;
      int out_block_index_0 = v2_0 / blocks_shape_1;
      int v2_4 = out_block_index_0 * blocks_shape_1;
      int out_block_index_1 = v2_0 - v2_4;
      int v2_5 = out_block_index_0 * 16;
      int dim_index_0 = v2_5 + in_block_index_0;
      int v2_6 = out_block_index_1 * 16;
      int dim_index_1 = v2_6 + in_block_index_1;
      bool v2_7 = dim_index_0 < asint(var[1]);
      bool v2_8 =

In [13]:
# Benchmark inputs: two 512x512 float32 matrices, plus TensorFrost copies.
Anp = np.random.rand(512, 512).astype(np.float32)
Bnp = np.random.rand(512, 512).astype(np.float32)
A = tf.tensor(Anp)
B = tf.tensor(Bnp)

repeat = 10

# Time the compiled TensorFrost program. perf_counter() is a monotonic,
# high-resolution clock, unlike time.time() which can jump with wall-clock
# adjustments.
start = time.perf_counter()
for i in range(repeat):
    C, = mmul(A, B)
# The readback to NumPy stays inside the timed region, as in the original
# measurement. `C.numpy` is accessed as an attribute, matching the recorded run.
Cnp = C.numpy
tf_time = (time.perf_counter() - start) / repeat


# Compare to NumPy evaluating the same expression on the host arrays.
start = time.perf_counter()
for i in range(repeat):
    Cnp2 = (np.sin(Anp) @ np.cos(Bnp).T)**2.0
np_time = (time.perf_counter() - start) / repeat

# Relative Frobenius-norm error of the TensorFrost result against NumPy.
Cerror = np.linalg.norm(Cnp - Cnp2) / np.linalg.norm(Cnp2)
print("Error:", Cerror)
print("TF Time:", tf_time)
print("NP Time:", np_time)
print("Speedup:", np_time / tf_time)

# The product is (N x M) @ (M x K) with N, M = Anp.shape. B is transposed
# before the multiply, so K is Bnp.shape[0], not Bnp.shape[1] (which is M).
# Only the multiply-adds of the matmul are counted; the elementwise sin, cos
# and square are not.
matmul_flops = 2 * Anp.shape[0] * Anp.shape[1] * Bnp.shape[0]
tf_flops = matmul_flops / tf_time
print("TF GFLOPS:", tf_flops / 1e9)
np_flops = matmul_flops / np_time
print("NP GFLOPS:", np_flops / 1e9)

Error: 7.16898e-07
TF Time: 0.08479938507080079
NP Time: 0.0030983448028564452
Speedup: 0.036537349890798994
TF GFLOPS: 3.165535407784828
NP GFLOPS: 86.63834178575681
