In [1]:
import tensorflow as tf
import numpy as np

from library.array_ops import asymmetrical_vectored_lookup as lookup

In [2]:
def pretty_print_guess_tensor(const_guess, operand_guess, operator_guess):
    # TODO: const_guess

    s = []

    for t in operand_guess:
        s += [f'x_{tf.argmax(t)}']

    operator_lookup = ['+','-', '*','/']
    result = s[::]
    for i, op_one_hot in enumerate(operator_guess):
        operators = tf.argmax(op_one_hot,axis=-1)
        left = result[::2]
        right = (result[1:] + result[:1])[::2]
        ops = operators[:len(left)]
        result = []
        for l, op, r in zip(left, ops, right):
            result += [f'({l} {operator_lookup[op]} {r})']


    return ' '.join(result)

NUM_LEAVES = 8
NUM_OPERATORS = 4
v1 = tf.range(NUM_LEAVES)
v2 = tf.range(NUM_OPERATORS)

cgv = tf.one_hot(v1 // 2, NUM_LEAVES//2, dtype=tf.float32)
const_guess = tf.concat([cgv, cgv],axis=1)
operand_guess = tf.one_hot(v1, NUM_LEAVES, dtype=tf.float32)
ogv = tf.expand_dims(tf.one_hot(v2, NUM_OPERATORS, dtype=tf.float32), axis=0)
operator_guess = tf.concat([ogv,ogv,ogv], axis=0)

pretty_print_guess_tensor(const_guess, operand_guess, operator_guess)

'(((x_0 + x_1) + (x_2 - x_3)) + ((x_4 * x_5) - (x_6 / x_7)))'

In [3]:
@tf.function
def to_prob_dist_all(v):
    v2 = tf.sqrt(tf.square(v)+1e-9)
    # v2 = tf.sqrt(tf.square(v))
    m = tf.expand_dims(tf.reduce_sum(v2, axis=-1),-1)
    n = tf.math.divide_no_nan(v2, m)
    return n

tf.print(tf.argmax(operator_guess))
tf.print(tf.argmax(to_prob_dist_all(operator_guess)))

[[0 0 0 0]
 [0 0 0 0]
 [0 0 0 0]
 [0 0 0 0]]
[[0 0 0 0]
 [0 0 0 0]
 [0 0 0 0]
 [0 0 0 0]]


In [4]:
@tf.function
def cross_entropy(x, y, epsilon = 1e-9):
    return -2 * tf.reduce_mean(y * tf.math.log(x + epsilon), -1) / tf.math.log(2.)

x = tf.constant([
    [1.0,0],
    [0.5,0.5],
    [.75,.25]
    ]
,dtype=tf.float32)

with tf.GradientTape() as tape:
    tape.watch(x)
    y = cross_entropy(x, x)

tf.print(y)
tf.print(tape.gradient(y, x))

[-0 1 0.811278105]
[[-1.44269502 29.8973541]
 [-0.442695022 -0.442695022]
 [-1.02765751 0.557305]]


In [5]:
@tf.function
def entropy(x):
    return cross_entropy(x, x)

In [6]:
@tf.function
def dot(x, y):
    r = tf.multiply(x, y)
    return tf.reduce_sum(r, -1)

x = tf.constant([
    [2,2,2],
    [3,3,3]
])

dot(x, x)

<tf.Tensor: shape=(2,), dtype=int32, numpy=array([12, 27])>

In [7]:
@tf.function
@tf.custom_gradient
def asymmetrical_vectored_lookup(v, k):
    k_shape = tf.shape(k)
    v_shape = tf.shape(v)

    tf.debugging.assert_equal(k_shape, v_shape)

    # Pick the value at the most likely index, non-differentiably
    flat_k = tf.reshape(k, [-1, k_shape[-1]])
    collapsed_k = tf.argmax(flat_k, -1)
    collapsed_k = tf.one_hot(collapsed_k, k_shape[-1])
    unflat_k = tf.reshape(collapsed_k, k_shape)
    forward_result = dot(v, unflat_k)

    def grad(upstream_grads):
        # Estimate the target scalar which we want to look up
        target = forward_result - upstream_grads
        target = tf.expand_dims(target, -1)

        # Find the index of element in the array which is closest to target
        diff_vector = tf.math.squared_difference(v, target)
        d_idx = tf.argmin(diff_vector, axis=-1)

        # Create a vector which is 1 everywhere except the idx
        # of the target, where it is -1
        ones = tf.ones(k_shape)
        eyes = tf.one_hot([d_idx], k_shape[-1])[0]
        k_grad = -(2 * eyes - ones)

        # d/dv (v . k) = k
        v_grad = k

        upstream_grads = tf.expand_dims(upstream_grads, -1)
        return upstream_grads * v_grad, tf.math.abs(upstream_grads) * k_grad

    return forward_result, grad

v = tf.constant([
    [[ 1.,  2.,  3.], [ 4.,  5.,  6.]],
    [[ 7.,  8.,  9.], [11., 22., 33.]],
]
,dtype=tf.float32)

k = tf.constant([
    [[1., 0., 0.], [0., 1., 0.]],
    [[1., 1., 0.], [0., 0., 1.]]
],dtype=tf.float32)

target = tf.constant([
    [2, 4],
    [7, 34],
],dtype=tf.float32)

with tf.GradientTape(persistent=True) as tape:
    tape.watch(k)
    tape.watch(v)
    result = asymmetrical_vectored_lookup(v, k)
    loss = tf.nn.l2_loss(result - target)

tf.print(result)
tf.print(tape.gradient(loss, k))
# tf.print(tape.gradient(loss, v))

[[1 5]
 [7 33]]
[[[1 -1 1]
  [-1 1 1]]

 [[0 0 0]
  [1 1 -1]]]


In [8]:
# @tf.function
# def resolve_values(const_guess, values, operand_guess):
#     # TODO: const_guess

#     operand_guess = to_prob_dist_all(operand_guess)

#     operand_count = tf.shape(operand_guess)[0]
#     values = tf.expand_dims(values, axis=0)
#     values = tf.tile(values, [operand_count,1])

#     result = asymmetrical_vectored_lookup(values, operand_guess)

#     return result

# v1 = tf.range(NUM_LEAVES)
# cgv = tf.one_hot(v1 // 2, NUM_LEAVES//2, dtype=tf.float32)
# const_guess = tf.concat([cgv, cgv],axis=1)
# # operand_guess = tf.one_hot(v1, NUM_LEAVES, dtype=tf.float32)
# operand_guess = tf.constant([
#        [ 0,  0,  0,  0,  0, 1.0, 0, 0],
#        [ 0,  0,  0,  0,  0, 1.0, 0, 0],
#        [ 0,  0,  0,  0,  0, 1.0, 0, 0],
#        [ 0,  0,  0,  0,  0, 1.0, 0, 0],
#        [ 0,  0,  0,  0,  0, 1.0, 0, 0],
#        [ 0,  0,  0,  0,  0, 1.0, 0, 0],
#        [ 0,  0,  0,  0,  0, 1.0, 0, 0],
#        [ 0,  0,  0,  0,  0, 1.0, 0, 0],
# ])
# operand_guess = tf.Variable(operand_guess)
# # operand_guess = tf.Variable([
# #     0,0.5,0.5,0, 0,0,0,0,
# # ],dtype=tf.float32)
# values = tf.cast(v1,dtype=tf.float32) + 1.0

# target = tf.ones((NUM_LEAVES,), dtype=tf.float32) * 2.0

# # opt = tf.keras.optimizers.Adam(3e-4)
# resolve_values(const_guess, values, operand_guess)

In [9]:
@tf.function
def resolve_values(const_guess, values, operand_guess):
    # TODO: const_guess

    # tf.debugging.assert_rank(const_guess, 3)
    tf.debugging.assert_rank(values, 3) # [outer_batch, inner_batch, VALUES_SIZE]
    tf.debugging.assert_rank(operand_guess, 3) # [outer_batch, LEAVES_SIZE, VALUES_SIZE]

    values_shape = tf.shape(values)
    operands_shape = tf.shape(operand_guess)

    outer_batch, inner_batch, VALUES_SIZE = [values_shape[0], values_shape[1], values_shape[2]]
    outer_batch, LEAVES_SIZE, VALUES_SIZE = [operands_shape[0], operands_shape[1], operands_shape[2]]

    # Broadcast the operand choices
    operand_guess = tf.expand_dims(operand_guess, axis=1)
    operand_guess = tf.tile(operand_guess, [1,inner_batch,1,1]) # [outer_batch, inner_batch, LEAVES_SIZE, VALUES_SIZE]

    # Broadcast the values
    values = tf.expand_dims(values, axis=2)
    values = tf.tile(values, [1, 1, LEAVES_SIZE, 1]) # [outer_batch, inner_batch, LEAVES_SIZE, VALUES_SIZE]

    # Dot product
    # operand_guess = to_prob_dist_all(operand_guess)
    # result = dot(values, operand_guess) # [outer_batch, inner_batch, LEAVES_SIZE]
    result = asymmetrical_vectored_lookup(values, operand_guess)

    return result

# v1 = tf.range(NUM_LEAVES)
# cgv = tf.one_hot(v1 // 2, NUM_LEAVES//2, dtype=tf.float32)
# const_guess = tf.concat([cgv, cgv],axis=1)

operand_guess = tf.Variable([
    [[1,0,0], [0,1,0], [0,0,1]],
    [[1,1,0], [0,0,1], [1,1,1]],
],dtype=tf.float32)

values = tf.constant([
    [[1,2,3],
    [4,5,6]],
    [[7,8,9],
    [11,22,33]],
], dtype=tf.float32)

with tf.GradientTape() as tape:
    result = resolve_values(const_guess, values, operand_guess)

grads = tape.gradient(result, operand_guess)

tf.print(tf.round(result))
tf.print(grads)

[[[1 2 3]
  [4 5 6]]

 [[7 9 7]
  [11 33 11]]]
[[[-2 2 2]
  [-2 2 2]
  [2 -2 2]]

 [[-2 2 2]
  [2 0 0]
  [-2 2 2]]]


In [10]:
operand_guess = tf.Variable([
    [[0,1,0], [0,1,0], [0,1,0]],
    [[0,1,0], [0,1,0], [0,1,0]],
],dtype=tf.float32)

values = tf.constant([
    [[1,2,3], [4,5,6]],
    [[7,8,9], [11,22,33]],
], dtype=tf.float32)

target = tf.constant([
    [[3, 2, 1], [6, 5, 4]],
    [[7, 9, 7], [11, 33, 11]],
],dtype=tf.float32)

operand_guess = tf.Variable(operand_guess)

lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    1e-1,
    decay_steps=100,
    decay_rate=1e-1,
    staircase=True)
opt = tf.keras.optimizers.Adam(lr_schedule)

steps = 10

for i in range(steps):
    with tf.GradientTape() as tape:
        resolved = resolve_values(const_guess, values, operand_guess)

        target_loss = tf.nn.l2_loss(resolved - target)
        # entropy_loss = entropy(operand_guess)

        loss = target_loss # + entropy_loss * 0
    
    variables = [operand_guess]
    grads = tape.gradient(loss, variables)
    opt.apply_gradients(zip(grads, variables))

    operand_guess.assign(to_prob_dist_all(operand_guess))

    if i % (steps // 10) == 0:
        # dist = tf.round(operand_guess * 100)
        tf.print(loss)

tf.round(operand_guess * 100)

185
185
185
185
185
0
0
0
0
0


<tf.Tensor: shape=(2, 3, 3), dtype=float32, numpy=
array([[[  0.,  12.,  88.],
        [  0., 100.,   0.],
        [ 88.,  12.,   0.]],

       [[ 88.,  12.,   0.],
        [  0.,  12.,  88.],
        [ 88.,  12.,   0.]]], dtype=float32)>

In [11]:
@tf.function
def operate(operands, operators):
    tf.debugging.assert_rank(operands, 3, 'Expected operands to be rank 3. [equation, datapoint, level]')
    tf.debugging.assert_rank(operators, 3, 'Expected operators to be rank 3. [equation, op_pair, op_type_one_hot]')

    opd_shape = tf.shape(operands)
    tf.debugging.assert_equal(opd_shape[-1] % 2, 0, 'Shape of axis -1 of operands must be div by 2')

    left = operands[:, :, ::2]
    right = tf.roll(operands, shift=-1, axis=-1)[:, :, ::2]

    r_add = left + right
    r_sub = left - right
    r_mul = left * right
    r_div = tf.math.divide_no_nan(left, right)

    r = tf.stack([r_add, r_sub, r_mul, r_div], axis=-1) # [equation, datapoint, op_pair, op_type_one_hot]

    opt = tf.expand_dims(operators, axis=1)
    opt = tf.tile(opt, [1, opd_shape[1], 1, 1]) # [equation, datapoint, op_pair, op_type_one_hot]

    # operators = tf.nn.softmax(operators, axis=-1)
    # operators = to_prob_dist_all(operators)

    result = asymmetrical_vectored_lookup(r, opt) # [equation, datapoint, op_pair]

    return result

operands = tf.constant([
    [[1, 2, 3, 4], [5, 6, 7, 8]],
    [[11, 22, 33, 44], [55, 66, 77, 88]],
],dtype=tf.float32)

add = [1,0,0,0]
sub = [0,1,0,0]
mul = [0,0,1,0]
div = [0,0,0,1]

operators = tf.constant([
    [add, sub],
    [mul, div]
],dtype=tf.float32)

operands = tf.Variable(operands)
operators = tf.Variable(operators)
target = tf.constant([
    [[1 + 2, 3 - 4], [5 + 6, 7 - 8]],
    [[11 * 22, 33 / 44], [55 * 66, 77 / 88]],
],dtype=tf.float32)

with tf.GradientTape() as tape:
    result = operate(operands, operators)
    loss = tf.nn.l2_loss(result - target)

tf.print(result)
tape.gradient(loss, operators)

<tf.Tensor: shape=(4, 4), dtype=float32, numpy=
array([[ -2.5,   2.5,   2.5,   2.5],
       [ -5. ,   5. ,   5. ,   5. ],
       [ 31. ,  31. , -31. ,  31. ],
       [ -0. ,   0. ,   0. ,   0. ]], dtype=float32)>

In [12]:
operands = tf.constant([
    [[1, 2, 3, 4], [5, 6, 7, 8]],
    [[11, 22, 33, 44], [55, 66, 77, 88]],
],dtype=tf.float32)

add = [1,0,0,0]
sub = [1,0,0,0]
mul = [1,0,0,0]
div = [1,0,0,0]

operators = tf.constant([
    [add, sub],
    [mul, div]
],dtype=tf.float32)

operands = tf.Variable(operands)
operators = tf.Variable(operators)
target = tf.constant([
    [[1 + 2, 3 - 4], [5 + 6, 7 - 8]],
    [[11 * 22, 33 / 44], [55 * 66, 77 / 88]],
],dtype=tf.float32)

lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    1e-1,
    decay_steps=100,
    decay_rate=1e-1,
    staircase=True)
opt = tf.keras.optimizers.Adam(lr_schedule)

steps = 10
for i in range(steps):
    with tf.GradientTape() as tape:
        result = operate(operands, operators)
        target_loss = tf.nn.l2_loss(target - result)

        # entropy_loss = tf.reduce_sum(entropy(operators))

        loss = target_loss # + entropy_loss * 0.

    variables = [operators]
    grads = tape.gradient(loss, variables)

    opt.apply_gradients(zip(grads, variables))
    operators.assign(to_prob_dist_all(operators))

    if i % (steps // 10) == 0:
        tf.print(loss)

tf.print(tf.round(operators * 100))

667.882813 -0 [0.5 12 -1 15] [3 2 1 0]
0 0.00480556 [3 -1 30 0.875] [0 1 2 3]
0 0.0249008518 [3 -1 30 0.875] [0 1 2 3]
0 0.0337869972 [3 -1 30 0.875] [0 1 2 3]
0 0.0403830074 [3 -1 30 0.875] [0 1 2 3]
0 0.0458208323 [3 -1 30 0.875] [0 1 2 3]
0 0.0505302586 [3 -1 30 0.875] [0 1 2 3]
0 0.054729566 [3 -1 30 0.875] [0 1 2 3]
0 0.0585467 [3 -1 30 0.875] [0 1 2 3]
0 0.0620633774 [3 -1 30 0.875] [0 1 2 3]


In [13]:
def eager_process_block(operands, operators_arr):
    acc = operands

    for operators in operators_arr:
        num_operands = tf.shape(acc)[0]
        operators = operators[:num_operands // 2]
        acc = operate(acc, operators)

    return acc

NUM_LEAVES = 8
NUM_OPERATORS = 4
v1 = tf.range(NUM_LEAVES)
v2 = tf.range(NUM_OPERATORS)

cgv = tf.one_hot(v1 // 2, NUM_LEAVES//2, dtype=tf.float32)
const_guess = tf.concat([cgv, cgv],axis=1)
operand_guess = tf.one_hot(v1, NUM_LEAVES, dtype=tf.float32)
ogv = tf.expand_dims(tf.one_hot(v2, NUM_OPERATORS, dtype=tf.float32), axis=0)
operator_guess = tf.concat([ogv,ogv,ogv], axis=0)
values = tf.cast(v1,dtype=tf.float32)
operands = resolve_values(const_guess, values, operand_guess)

with tf.GradientTape(persistent=True) as tape:
    tape.watch(operands)
    tape.watch(operator_guess)
    result = eager_process_block(operands, operator_guess)

tf.print(pretty_print_guess_tensor(const_guess, operand_guess, operator_guess))
x_0, x_1, x_2, x_3, x_4, x_5, x_6, x_7 = list(range(8))
tf.print((((x_0 + x_1) + (x_2 - x_3)) + ((x_4 * x_5) - (x_6 / x_7))))
tf.print(result)
tf.print(tf.reshape(tape.gradient(result, operands),(2,4)))
tf.print(tape.gradient(result, operator_guess))

ValueError: in user code:

    <ipython-input-9-df11ca9b5f79>:6 resolve_values  *
        tf.debugging.assert_rank(values, 3) # [outer_batch, inner_batch, VALUES_SIZE]
    C:\Users\windows\AppData\Local\Programs\Python\Python37\lib\site-packages\tensorflow\python\util\dispatch.py:201 wrapper  **
        return target(*args, **kwargs)
    C:\Users\windows\AppData\Local\Programs\Python\Python37\lib\site-packages\tensorflow\python\ops\check_ops.py:1122 assert_rank_v2
        return assert_rank(x=x, rank=rank, message=message, name=name)
    C:\Users\windows\AppData\Local\Programs\Python\Python37\lib\site-packages\tensorflow\python\util\dispatch.py:201 wrapper
        return target(*args, **kwargs)
    C:\Users\windows\AppData\Local\Programs\Python\Python37\lib\site-packages\tensorflow\python\ops\check_ops.py:1181 assert_rank
        (message, name, e.args[2], e.args[1], x.get_shape()))

    ValueError: .  Tensor values:0 must have rank 3.  Received rank 1, shape (8,)


In [14]:
@tf.function
def unrolled_process_block_3(operands, operators_arr):
    acc = operands

    # Level 1
    operators = operators_arr[0]
    operators = operators[:4]
    acc = operate(acc, operators)

    # Level 2
    operators = operators_arr[1]
    operators = operators[:2]
    acc = operate(acc, operators)

    # Level 3
    operators = operators_arr[2]
    operators = operators[:1]
    acc = operate(acc, operators)

    return acc

NUM_LEAVES = 8
NUM_OPERATORS = 4
v1 = tf.range(NUM_LEAVES)
v2 = tf.range(NUM_OPERATORS)

cgv = tf.one_hot(v1 // 2, NUM_LEAVES//2, dtype=tf.float32)
const_guess = tf.concat([cgv, cgv],axis=1)
operand_guess = tf.one_hot(v1, NUM_LEAVES, dtype=tf.float32)
ogv = tf.expand_dims(tf.one_hot(v2, NUM_OPERATORS, dtype=tf.float32), axis=0)
operator_guess = tf.concat([ogv,ogv,ogv], axis=0)
values = tf.cast(v1,dtype=tf.float32)
operands = resolve_values(const_guess, values, operand_guess)

with tf.GradientTape(persistent=True) as tape:
    tape.watch(operands)
    tape.watch(operator_guess)
    result = unrolled_process_block_3(operands, operator_guess)

tf.print(pretty_print_guess_tensor(const_guess, operand_guess, operator_guess))
x_0, x_1, x_2, x_3, x_4, x_5, x_6, x_7 = list(range(8))
tf.print((((x_0 + x_1) + (x_2 - x_3)) + ((x_4 * x_5) - (x_6 / x_7))))
tf.print(result)
tf.print(tf.reshape(tape.gradient(result, operands),(2,4)))
tf.print(tape.gradient(result, operator_guess))

ValueError: in user code:

    <ipython-input-9-df11ca9b5f79>:6 resolve_values  *
        tf.debugging.assert_rank(values, 3) # [outer_batch, inner_batch, VALUES_SIZE]
    C:\Users\windows\AppData\Local\Programs\Python\Python37\lib\site-packages\tensorflow\python\util\dispatch.py:201 wrapper  **
        return target(*args, **kwargs)
    C:\Users\windows\AppData\Local\Programs\Python\Python37\lib\site-packages\tensorflow\python\ops\check_ops.py:1122 assert_rank_v2
        return assert_rank(x=x, rank=rank, message=message, name=name)
    C:\Users\windows\AppData\Local\Programs\Python\Python37\lib\site-packages\tensorflow\python\util\dispatch.py:201 wrapper
        return target(*args, **kwargs)
    C:\Users\windows\AppData\Local\Programs\Python\Python37\lib\site-packages\tensorflow\python\ops\check_ops.py:1181 assert_rank
        (message, name, e.args[2], e.args[1], x.get_shape()))

    ValueError: .  Tensor values:0 must have rank 3.  Received rank 1, shape (8,)


In [15]:
def print_collapsed_result(values, const_guess, operand_guess, operator_guess):
    # TODO: const_guess

    operands = tf.round(operand_guess)
    acc = resolve_values(const_guess, values, operands)
    operators = tf.round(operator_guess)

    result = eager_process_block(acc, operators)

    return result

NUM_LEAVES = 8
NUM_OPERATORS = 4
v1 = tf.range(NUM_LEAVES)
v2 = tf.range(NUM_OPERATORS)

cgv = tf.one_hot(v1 // 2, NUM_LEAVES//2, dtype=tf.float32)
const_guess = tf.concat([cgv, cgv],axis=1)
operand_guess = tf.one_hot(v1, NUM_LEAVES, dtype=tf.float32)
ogv = tf.expand_dims(tf.one_hot(v2, NUM_OPERATORS, dtype=tf.float32), axis=0)
operator_guess = tf.concat([ogv,ogv,ogv], axis=0)

print_collapsed_result(values, const_guess, operand_guess, operator_guess)

ValueError: in user code:

    <ipython-input-9-df11ca9b5f79>:6 resolve_values  *
        tf.debugging.assert_rank(values, 3) # [outer_batch, inner_batch, VALUES_SIZE]
    C:\Users\windows\AppData\Local\Programs\Python\Python37\lib\site-packages\tensorflow\python\util\dispatch.py:201 wrapper  **
        return target(*args, **kwargs)
    C:\Users\windows\AppData\Local\Programs\Python\Python37\lib\site-packages\tensorflow\python\ops\check_ops.py:1122 assert_rank_v2
        return assert_rank(x=x, rank=rank, message=message, name=name)
    C:\Users\windows\AppData\Local\Programs\Python\Python37\lib\site-packages\tensorflow\python\util\dispatch.py:201 wrapper
        return target(*args, **kwargs)
    C:\Users\windows\AppData\Local\Programs\Python\Python37\lib\site-packages\tensorflow\python\ops\check_ops.py:1181 assert_rank
        (message, name, e.args[2], e.args[1], x.get_shape()))

    ValueError: .  Tensor values:0 must have rank 3.  Received rank 1, shape (8,)


In [16]:
def bind_opt_train_step(opt, entropy_weight=1e+2):
    @tf.function
    def train_step(const_guess, operand_guess, operator_guess, values, target):
        with tf.GradientTape() as tape:
            cg, opg, otg = const_guess, operand_guess, operator_guess
            # cg = tf.nn.softmax(cg)
            # opg = tf.nn.softmax(opg)
            # otg = tf.nn.softmax(otg)

            cg = to_prob_dist_all(cg)
            opg = to_prob_dist_all(opg)
            otg = to_prob_dist_all(otg)

            cg_entropy = 0.0 # TODO
            opg_entropy = tf.reduce_sum(entropy(opg))
            otg_entropy = tf.reduce_sum(entropy(otg))

            operands = resolve_values(cg, values, opg)
            result = unrolled_process_block_3(operands, otg)

            target_loss = tf.nn.l2_loss(result[0] - target)

            loss = target_loss

            if target_loss < 1:
                loss += entropy_weight * (opg_entropy + otg_entropy)

        variables = [operand_guess, operator_guess]
        grads = tape.gradient(loss, variables)
        # grads = [tf.clip_by_norm(g, 100.0) for g in grads]
        opt.apply_gradients(zip(grads, variables))

        const_guess.assign(to_prob_dist_all(const_guess))
        operand_guess.assign(to_prob_dist_all(operand_guess))
        operator_guess.assign(to_prob_dist_all(operator_guess))

        return loss, target_loss, cg_entropy, opg_entropy, otg_entropy

    return train_step

NUM_LEAVES = 8
NUM_OPERATORS = 4
v1 = tf.range(NUM_LEAVES)
v2 = tf.range(NUM_OPERATORS)

cgv = tf.one_hot(v1 // 2, NUM_LEAVES//2, dtype=tf.float32)
const_guess = tf.concat([cgv, cgv],axis=1)
operand_guess = tf.one_hot(v1, NUM_LEAVES, dtype=tf.float32)
ogv = tf.expand_dims(tf.one_hot(v2, NUM_OPERATORS, dtype=tf.float32), axis=0)
operator_guess = tf.concat([ogv,ogv,ogv], axis=0)
values = tf.cast(v1,dtype=tf.float32)

const_guess = tf.Variable(const_guess)
operand_guess = tf.Variable(operand_guess)
operator_guess = tf.Variable(operator_guess)

target = 19.0

# opt = tf.keras.optimizers.Adam(3e-4)
# opt = tf.keras.optimizers.SGD(1e-1)
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    1e-1,
    decay_steps=100,
    decay_rate=1e-1,
    staircase=True)
opt = tf.keras.optimizers.Adam(lr_schedule)
train_step = bind_opt_train_step(opt)

with tf.GradientTape(persistent=True) as tape:
    tape.watch(const_guess)
    tape.watch(operand_guess)
    tape.watch(operator_guess)

    result = train_step(const_guess, operand_guess, operator_guess, values, target)

tf.print(result)
tf.print(tape.gradient(result, operand_guess))
tf.print(tape.gradient(result, operator_guess))

ValueError: in user code:

    <ipython-input-16-242b5b2b2e1b>:18 train_step  *
        operands = resolve_values(cg, values, opg)
    <ipython-input-9-df11ca9b5f79>:6 resolve_values  *
        tf.debugging.assert_rank(values, 3) # [outer_batch, inner_batch, VALUES_SIZE]
    C:\Users\windows\AppData\Local\Programs\Python\Python37\lib\site-packages\tensorflow\python\util\dispatch.py:201 wrapper  **
        return target(*args, **kwargs)
    C:\Users\windows\AppData\Local\Programs\Python\Python37\lib\site-packages\tensorflow\python\ops\check_ops.py:1122 assert_rank_v2
        return assert_rank(x=x, rank=rank, message=message, name=name)
    C:\Users\windows\AppData\Local\Programs\Python\Python37\lib\site-packages\tensorflow\python\util\dispatch.py:201 wrapper
        return target(*args, **kwargs)
    C:\Users\windows\AppData\Local\Programs\Python\Python37\lib\site-packages\tensorflow\python\ops\check_ops.py:1181 assert_rank
        (message, name, e.args[2], e.args[1], x.get_shape()))

    ValueError: .  Tensor values:0 must have rank 3.  Received rank 1, shape (8,)


In [17]:
# opt = tf.keras.optimizers.Adam(3e-4)
# opt = tf.keras.optimizers.Adam(1e-2)
# opt = tf.keras.optimizers.SGD(1e-1)
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    1e-1,
    decay_steps=100,
    decay_rate=1e-1,
    staircase=True)
opt = tf.keras.optimizers.Adam(lr_schedule)
train_step = bind_opt_train_step(opt, 1)

NUM_LEAVES = 8
NUM_OPERATORS = 4
v1 = tf.range(NUM_LEAVES)
v2 = tf.range(NUM_OPERATORS)

cgv = tf.one_hot(v1 // 2, NUM_LEAVES//2, dtype=tf.float32)
const_guess = tf.concat([cgv, cgv],axis=1)
operand_guess = tf.one_hot(v1, NUM_LEAVES, dtype=tf.float32)
ogv = tf.expand_dims(tf.one_hot(v2, NUM_OPERATORS, dtype=tf.float32), axis=0)
operator_guess = tf.concat([ogv,ogv,ogv], axis=0)
values = tf.cast(v1,dtype=tf.float32)

const_guess = tf.Variable(const_guess)
operand_guess = tf.Variable(operand_guess)
operator_guess = tf.Variable(operator_guess)

target = 7.0
steps = 1000
for i in range(steps):
    loss, target_loss, cg_entropy, opg_entropy, otg_entropy = train_step(const_guess, operand_guess, operator_guess, values, target)

    if i % (steps // 10) == 0:
        cg = const_guess.numpy()
        opg = operand_guess.numpy()
        otg = operator_guess.numpy()
        collapsed_result = print_collapsed_result(values, cg, opg, otg)

        tf.print(i, collapsed_result[0], loss, target_loss, cg_entropy, opg_entropy, otg_entropy)
        # tf.print(pretty_print_guess_tensor(cg, opg, otg))
tf.print(pretty_print_guess_tensor(cg, opg, otg))

ValueError: in user code:

    <ipython-input-16-242b5b2b2e1b>:18 train_step  *
        operands = resolve_values(cg, values, opg)
    <ipython-input-9-df11ca9b5f79>:6 resolve_values  *
        tf.debugging.assert_rank(values, 3) # [outer_batch, inner_batch, VALUES_SIZE]
    C:\Users\windows\AppData\Local\Programs\Python\Python37\lib\site-packages\tensorflow\python\util\dispatch.py:201 wrapper  **
        return target(*args, **kwargs)
    C:\Users\windows\AppData\Local\Programs\Python\Python37\lib\site-packages\tensorflow\python\ops\check_ops.py:1122 assert_rank_v2
        return assert_rank(x=x, rank=rank, message=message, name=name)
    C:\Users\windows\AppData\Local\Programs\Python\Python37\lib\site-packages\tensorflow\python\util\dispatch.py:201 wrapper
        return target(*args, **kwargs)
    C:\Users\windows\AppData\Local\Programs\Python\Python37\lib\site-packages\tensorflow\python\ops\check_ops.py:1181 assert_rank
        (message, name, e.args[2], e.args[1], x.get_shape()))

    ValueError: .  Tensor values:0 must have rank 3.  Received rank 1, shape (8,)


In [18]:
tf.round(opg * 100)


NameError: name 'opg' is not defined

In [19]:
tf.round(entropy(opg)*100)

NameError: name 'opg' is not defined

In [42]:
tf.config.run_functions_eagerly(True)

In [59]:
@tf.function
def operate(operands, operators):
    tf.debugging.assert_rank(operands, 3, 'Expected operands to be rank 3. [equation, datapoint, level]')
    tf.debugging.assert_rank(operators, 3, 'Expected operators to be rank 3. [equation, op_pair, op_type_one_hot]')

    opd_shape = tf.shape(operands)
    tf.debugging.assert_equal(opd_shape[-1] % 2, 0, 'Shape of axis -1 of operands must be div by 2')

    left = operands[:, :, ::2]
    right = tf.roll(operands, shift=-1, axis=-1)[:, :, ::2]

    r_add = left + right
    r_sub = left - right
    r_mul = left * right
    r_div = tf.math.divide_no_nan(left, right)

    r = tf.stack([r_add, r_sub, r_mul, r_div], axis=-1) # [equation, datapoint, op_pair, op_type_one_hot]

    opt = tf.expand_dims(operators, axis=1)
    opt = tf.tile(opt, [1, opd_shape[1], 1, 1]) # [equation, datapoint, op_pair, op_type_one_hot]

    # operators = tf.nn.softmax(operators, axis=-1)
    # operators = to_prob_dist_all(operators)

    result = asymmetrical_vectored_lookup(r, opt) # [equation, datapoint, op_pair]

    return result

operands = tf.constant([
    [[1, 2, 3, 4], [5, 6, 7, 8]],
    [[11, 22, 33, 44], [55, 66, 77, 88]],
],dtype=tf.float32)

add = [1,0,0,0]
sub = [0,1,0,0]
mul = [0,0,1,0]
div = [0,0,0,1]

operators = tf.constant([
    [add, sub],
    [mul, div]
],dtype=tf.float32)

operands = tf.Variable(operands)
operators = tf.Variable(operators)
target = tf.constant([
    [[1 + 2, 3 - 4], [5 + 6, 7 - 8]],
    [[11 * 22, 33 / 44], [55 * 66, 77 / 88]],
],dtype=tf.float32)

with tf.GradientTape() as tape:
    result = operate(operands, operators)
    loss = tf.nn.l2_loss(result - target)

tf.print(result)
tape.gradient(loss, operators)

[[[3 -1]
  [11 -1]]

 [[242 0.75]
  [3630 0.875]]]


<tf.Tensor: shape=(2, 2, 4), dtype=float32, numpy=
array([[[-0.,  0.,  0.,  0.],
        [ 0., -0.,  0.,  0.]],

       [[ 0.,  0., -0.,  0.],
        [ 0.,  0.,  0., -0.]]], dtype=float32)>

In [63]:
operands = tf.constant([
    [[1, 2, 3, 4], [5, 6, 7, 8]],
    [[11, 22, 33, 44], [55, 66, 77, 88]],
],dtype=tf.float32)

add = [1,0,0,0]
sub = [1,0,0,0]
mul = [1,0,0,0]
div = [1,0,0,0]

operators = tf.constant([
    [add, sub],
    [mul, div]
],dtype=tf.float32)

operands = tf.Variable(operands)
operators = tf.Variable(operators)
target = tf.constant([
    [[1 + 2, 3 - 4], [5 + 6, 7 - 8]],
    [[11 * 22, 33 / 44], [55 * 66, 77 / 88]],
],dtype=tf.float32)

lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    1e-1,
    decay_steps=100,
    decay_rate=1e-1,
    staircase=True)
opt = tf.keras.optimizers.Adam(lr_schedule)

steps = 10
for i in range(steps):
    with tf.GradientTape() as tape:
        result = operate(operands, operators)
        target_loss = tf.nn.l2_loss(target - result)

        # entropy_loss = tf.reduce_sum(entropy(operators))

        loss = target_loss # + entropy_loss * 0.

    variables = [operators]
    grads = tape.gradient(loss, variables)

    opt.apply_gradients(zip(grads, variables))
    operators.assign(to_prob_dist_all(operators))

    if i % (steps // 10) == 0:
        # tf.print(target_loss, entropy_loss, result, tf.argmax(operators, axis=-1))
        tf.print(loss)

tf.print(tf.round(operators * 100))


6194916.5
6194916.5
6194916.5
6194916.5
6194916.5
0
0
0
0
0
[[[100 0 0 0]
  [8 89 1 1]]

 [[8 1 89 1]
  [8 1 1 89]]]
