In [1]:
# Set the TensorFlow C++ log level before TensorFlow is imported.
# NOTE(review): level 3 presumably silences TensorFlow's native log output — confirm against the TF docs.
%env TF_CPP_MIN_LOG_LEVEL = 3
import tensorflow as tf
# NOTE(review): `keras` is not referenced by any cell visible in this section.
from tensorflow import keras

env: TF_CPP_MIN_LOG_LEVEL=3


# Array Tests:
1. (10, 10) Pure Matmul: 52 µs ± 1.01 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
2. (1e3, 1e4) Pure Matmul: 1.88 ms ± 5.21 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)

Larger shapes run out of memory.

In [2]:
# Pure Matmul:
# Baseline: product of a small 10x10 random-normal matrix with itself, called
# directly (no tf.function wrapper). The operand is created outside the timed
# statement, so %timeit measures only `a @ a`.
a = tf.random.normal((10, 10))
%timeit a @ a

52 µs ± 1.01 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [3]:
# Larger pure-matmul case: a (1000, 10000) random-normal matrix times its
# transpose, giving a (1000, 1000) result. The shape uses int(...) because
# 1e3 / 1e4 are float literals. The timed statement covers the transpose and
# the product, not the creation of `a`.
a = tf.random.normal((int(1e3), int(1e4)))
%timeit a @ tf.transpose(a)

1.88 ms ± 5.21 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


# Transformation Tests:
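1. (1e2, 1e3) Jit matmul (figures are from a separate run, so they differ slightly from the cell outputs below):

   - First `%timeit` of the function (only 1 loop per run, large spread): 282 µs ± 170 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)

   - Repeated `%timeit` of the same function: 155 µs ± 1.49 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)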

In [2]:
@tf.function(jit_compile=True)
def matmul():
    a = tf.random.normal((int(1e2), int(1e3)))
    return a @ tf.transpose(a)
%timeit matmul()

222 µs ± 56.9 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [3]:
# Second timing of the same function, after it has already been called by the cell above.
%timeit matmul()

167 µs ± 2.6 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [2]:
# Pure Autodiff:
# Gradient of sum(a @ a^T) with respect to `a`, taken from an already-recorded
# tape (no tf.function wrapper). Only the tape.gradient call is timed; the
# forward pass below runs once, outside the measurement.
a = tf.random.normal((100, 1000))
# persistent=True because %timeit calls tape.gradient repeatedly on this one tape.
with tf.GradientTape(persistent=True) as tape:
    # `a` is a plain tensor from tf.random.normal, so it is watched explicitly.
    tape.watch(a)
    y = tf.matmul(a, tf.transpose(a))
    # Reduce to a scalar so the gradient is taken of a single value.
    y = tf.reduce_sum(y)
%timeit tape.gradient(y, a)

446 µs ± 9.56 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [2]:
# Chained Autodiff:
a = tf.random.normal((100, 1000))
@tf.function(jit_compile=True)
def fun(a):
    with tf.GradientTape(persistent=True) as tape:
        tape.watch(a)
        y = tf.matmul(a, tf.transpose(a))
        y = tf.reduce_sum(y)
    return tape.gradient(y, a)
%timeit fun(a)

210 µs ± 4.13 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [3]:
# Second timing of the same compiled function on the same input `a`.
%timeit fun(a)

203 µs ± 2.87 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
