In [1]:
import tensorflow as tf
import numpy as np


In [2]:
def pretty_print_guess_tensor(const_guess, operand_guess, operator_guess):
    # TODO: const_guess

    s = []

    for t in operand_guess:
        s += [f'x_{tf.argmax(t)}']

    operator_lookup = ['+','-', '*','/']
    result = s[::]
    for i, op_one_hot in enumerate(operator_guess):
        operators = tf.argmax(op_one_hot,axis=-1)
        left = result[::2]
        right = (result[1:] + result[:1])[::2]
        ops = operators[:len(left)]
        result = []
        for l, op, r in zip(left, ops, right):
            result += [f'({l} {operator_lookup[op]} {r})']


    return ' '.join(result)

NUM_LEAVES = 8
NUM_OPERATORS = 4
v1 = tf.range(NUM_LEAVES)
v2 = tf.range(NUM_OPERATORS)

cgv = tf.one_hot(v1 // 2, NUM_LEAVES//2, dtype=tf.float32)
const_guess = tf.concat([cgv, cgv],axis=1)
operand_guess = tf.one_hot(v1, NUM_LEAVES, dtype=tf.float32)
ogv = tf.expand_dims(tf.one_hot(v2, NUM_OPERATORS, dtype=tf.float32), axis=0)
operator_guess = tf.concat([ogv,ogv,ogv], axis=0)

pretty_print_guess_tensor(const_guess, operand_guess, operator_guess)

'(((x_0 + x_1) + (x_2 - x_3)) + ((x_4 * x_5) - (x_6 / x_7)))'

In [3]:
@tf.function
def to_prob_dist_all(v):
    v2 = tf.sqrt(tf.square(v)+1e-9)
    # v2 = tf.sqrt(tf.square(v))
    m = tf.expand_dims(tf.reduce_sum(v2, axis=-1),-1)
    n = tf.math.divide_no_nan(v2, m)
    return n

tf.print(tf.argmax(operator_guess))
tf.print(tf.argmax(to_prob_dist_all(operator_guess)))

[[0 0 0 0]
 [0 0 0 0]
 [0 0 0 0]
 [0 0 0 0]]
[[0 0 0 0]
 [0 0 0 0]
 [0 0 0 0]
 [0 0 0 0]]


In [4]:
@tf.function
def cross_entropy(x, y, epsilon = 1e-9):
    return -2 * tf.reduce_mean(y * tf.math.log(x + epsilon), -1) / tf.math.log(2.)

x = tf.constant([
    [1.0,0],
    [0.5,0.5],
    [.75,.25]
    ]
,dtype=tf.float32)

with tf.GradientTape() as tape:
    tape.watch(x)
    y = cross_entropy(x, x)

tf.print(y)
tf.print(tape.gradient(y, x))

[-0 1 0.811278105]
[[-1.44269502 29.8973541]
 [-0.442695022 -0.442695022]
 [-1.02765751 0.557305]]


In [5]:
@tf.function
def entropy(x):
    return cross_entropy(x, x)

In [6]:
@tf.function
def dot(x, y):
    r = tf.multiply(x, y)
    return tf.reduce_sum(r, -1)

x = tf.constant([
    [2,2,2],
    [3,3,3]
])

dot(x, x)

<tf.Tensor: shape=(2,), dtype=int32, numpy=array([12, 27])>

In [7]:
@tf.function
def resolve_values(const_guess, values, operand_guess):
    # TODO: const_guess

    operand_guess = to_prob_dist_all(operand_guess)

    operand_count = tf.shape(operand_guess)[0]
    values = tf.expand_dims(values, axis=0)
    values = tf.tile(values, [operand_count,1])
    
    result = dot(values, operand_guess)

    return result

v1 = tf.range(NUM_LEAVES)
cgv = tf.one_hot(v1 // 2, NUM_LEAVES//2, dtype=tf.float32)
const_guess = tf.concat([cgv, cgv],axis=1)
# operand_guess = tf.one_hot(v1, NUM_LEAVES, dtype=tf.float32)
operand_guess = tf.constant([
       [ 0,  0,  0,  0,  0, 1.0, 0, 0],
       [ 0,  0,  0,  0,  0, 1.0, 0, 0],
       [ 0,  0,  0,  0,  0, 1.0, 0, 0],
       [ 0,  0,  0,  0,  0, 1.0, 0, 0],
       [ 0,  0,  0,  0,  0, 1.0, 0, 0],
       [ 0,  0,  0,  0,  0, 1.0, 0, 0],
       [ 0,  0,  0,  0,  0, 1.0, 0, 0],
       [ 0,  0,  0,  0,  0, 1.0, 0, 0],
])
operand_guess = tf.Variable(operand_guess)
# operand_guess = tf.Variable([
#     0,0.5,0.5,0, 0,0,0,0,
# ],dtype=tf.float32)
values = tf.cast(v1,dtype=tf.float32) + 1.0

target = tf.ones((NUM_LEAVES,), dtype=tf.float32) * 2.0

# opt = tf.keras.optimizers.Adam(3e-4)
opt = tf.keras.optimizers.Adam(1e-2)
resolved = resolve_values(const_guess, values, operand_guess)
for i in range(1000):
    with tf.GradientTape() as tape:
        resolved = resolve_values(const_guess, values, operand_guess)

        target_loss = tf.nn.l2_loss(resolved - target)
        entropy_loss = entropy(operand_guess)

        loss = target_loss + entropy_loss * 1e+1
    
    variables = [operand_guess]
    grads = tape.gradient(loss, variables)
    opt.apply_gradients(zip(grads, variables))

    operand_guess.assign(to_prob_dist_all(operand_guess))

    if i % 100 == 0:
        print_idx = 2
        dist = list(tf.round(operand_guess * 100).numpy().astype(np.int32))
        tf.print(resolved[print_idx], loss[print_idx], target_loss, entropy_loss[print_idx], dist[print_idx])

np.vstack(tf.round(operand_guess * 100).numpy().astype(np.int32))

5.99962091 63.9878693 63.9878693 -0 array([ 1,  1,  1,  1,  1, 94,  1,  1])
1.8825922 4.25354767 0.0551383644 0.419840902 array([40, 35, 23,  0,  0,  1,  0,  0])
1.84156811 4.22222662 0.100402653 0.412182391 array([41, 37, 20,  0,  0,  1,  1,  0])
1.77605987 4.15201044 0.200596735 0.395141363 array([42, 42, 15,  0,  0,  1,  1,  0])
1.61463773 3.40017581 0.594016314 0.280615956 array([42, 56,  1,  0,  0,  1,  0,  0])
1.73385024 2.85548139 0.283342779 0.257213861 array([30, 66,  1,  0,  0,  1,  1,  0])
1.90067697 1.96043122 0.0394602604 0.192097098 array([14, 81,  1,  0,  0,  1,  1,  1])
2.07097793 0.782594442 0.0201514643 0.0762443 array([ 0, 96,  1,  0,  0,  1,  1,  0])
2.10031509 0.911767364 0.0402524732 0.0871514902 array([ 0, 97,  1,  0,  0,  1,  1,  0])
2.10497761 0.913960576 0.0440811925 0.086987935 array([ 0, 97,  1,  0,  0,  1,  1,  0])


array([[ 1, 96,  0,  1,  1,  0,  0,  1],
       [ 1, 96,  0,  1,  1,  0,  0,  1],
       [ 1, 96,  0,  1,  1,  0,  0,  1],
       [ 1, 96,  0,  1,  1,  0,  0,  1],
       [ 1, 96,  0,  1,  1,  0,  0,  1],
       [ 1, 96,  0,  1,  1,  0,  0,  1],
       [ 1, 96,  0,  1,  1,  0,  0,  1],
       [ 1, 96,  0,  1,  1,  0,  0,  1]])

In [8]:
@tf.function
def operate(operands, operators):
    left = operands[::2]
    right = tf.roll(operands, shift=-1, axis=0)[::2]

    r_add = left + right
    r_sub = left - right
    r_mul = left * right
    r_div = tf.math.divide_no_nan(left, right)

    r = tf.stack([r_add, r_sub, r_mul, r_div], axis=1)

    return dot(r, operators)

operands = tf.range(NUM_LEAVES, dtype=tf.float32)
v2 = tf.range(NUM_OPERATORS)
operators = tf.constant([
    [1,0,0,0],
    [0,1,0,0],
    [0,0,1,0],
    [0,0,0,1],
],dtype=tf.float32)

with tf.GradientTape(persistent=True) as tape:
    tape.watch(operands)
    tape.watch(operators)
    result = operate(operands, operators)

tf.print(result)
tf.print(tf.reshape(tape.gradient(result, operands),(2,4)))
tf.print(tape.gradient(result, operators))

[1 -1 20 0.857142866]
[[1 1 1 -1]
 [5 4 0.142857149 -0.122448981]]
[[1 -1 0 0]
 [5 -1 6 0.666666687]
 [9 -1 20 0.8]
 [13 -1 42 0.857142866]]


In [9]:
def eager_process_block(operands, operators_arr):
    acc = operands

    for operators in operators_arr:
        num_operands = tf.shape(acc)[0]
        operators = operators[:num_operands // 2]
        acc = operate(acc, operators)

    return acc

NUM_LEAVES = 8
NUM_OPERATORS = 4
v1 = tf.range(NUM_LEAVES)
v2 = tf.range(NUM_OPERATORS)

cgv = tf.one_hot(v1 // 2, NUM_LEAVES//2, dtype=tf.float32)
const_guess = tf.concat([cgv, cgv],axis=1)
operand_guess = tf.one_hot(v1, NUM_LEAVES, dtype=tf.float32)
ogv = tf.expand_dims(tf.one_hot(v2, NUM_OPERATORS, dtype=tf.float32), axis=0)
operator_guess = tf.concat([ogv,ogv,ogv], axis=0)
values = tf.cast(v1,dtype=tf.float32)
operands = resolve_values(const_guess, values, operand_guess)

with tf.GradientTape(persistent=True) as tape:
    tape.watch(operands)
    tape.watch(operator_guess)
    result = eager_process_block(operands, operator_guess)

tf.print(pretty_print_guess_tensor(const_guess, operand_guess, operator_guess))
x_0, x_1, x_2, x_3, x_4, x_5, x_6, x_7 = list(range(8))
tf.print((((x_0 + x_1) + (x_2 - x_3)) + ((x_4 * x_5) - (x_6 / x_7))))
tf.print(result)
tf.print(tf.reshape(tape.gradient(result, operands),(2,4)))
tf.print(tape.gradient(result, operator_guess))

(((x_0 + x_1) + (x_2 - x_3)) + ((x_4 * x_5) - (x_6 / x_7)))
19.142857142857142
[19.1424599]
[[1 1 1 -1]
 [4.99962044 3.99987364 -0.142875209 0.122467041]]
[[[1.00151753 -0.999747038 0.000885801506 0.000884682406]
  [5.00050545 -0.999747038 6.00139093 0.666765034]
  [8.99949455 -0.999746799 19.9978504 0.800035477]
  [-12.9984827 0.999747276 -41.9902649 -0.857160926]]

 [[0.00177049637 2.00126457 -1.00126421 -1.00177097]
  [20.855011 19.1406898 17.1413765 23.3303337]
  [0 0 0 0]
  [0 0 0 0]]

 [[19.1424599 -19.1389198 0.0338885225 9.24990891e-05]
  [0 0 0 0]
  [0 0 0 0]
  [0 0 0 0]]]


In [10]:
@tf.function
def unrolled_process_block_3(operands, operators_arr):
    acc = operands

    # Level 1
    operators = operators_arr[0]
    operators = operators[:4]
    acc = operate(acc, operators)

    # Level 2
    operators = operators_arr[1]
    operators = operators[:2]
    acc = operate(acc, operators)

    # Level 3
    operators = operators_arr[2]
    operators = operators[:1]
    acc = operate(acc, operators)

    return acc

NUM_LEAVES = 8
NUM_OPERATORS = 4
v1 = tf.range(NUM_LEAVES)
v2 = tf.range(NUM_OPERATORS)

cgv = tf.one_hot(v1 // 2, NUM_LEAVES//2, dtype=tf.float32)
const_guess = tf.concat([cgv, cgv],axis=1)
operand_guess = tf.one_hot(v1, NUM_LEAVES, dtype=tf.float32)
ogv = tf.expand_dims(tf.one_hot(v2, NUM_OPERATORS, dtype=tf.float32), axis=0)
operator_guess = tf.concat([ogv,ogv,ogv], axis=0)
values = tf.cast(v1,dtype=tf.float32)
operands = resolve_values(const_guess, values, operand_guess)

with tf.GradientTape(persistent=True) as tape:
    tape.watch(operands)
    tape.watch(operator_guess)
    result = unrolled_process_block_3(operands, operator_guess)

tf.print(pretty_print_guess_tensor(const_guess, operand_guess, operator_guess))
x_0, x_1, x_2, x_3, x_4, x_5, x_6, x_7 = list(range(8))
tf.print((((x_0 + x_1) + (x_2 - x_3)) + ((x_4 * x_5) - (x_6 / x_7))))
tf.print(result)
tf.print(tf.reshape(tape.gradient(result, operands),(2,4)))
tf.print(tape.gradient(result, operator_guess))

(((x_0 + x_1) + (x_2 - x_3)) + ((x_4 * x_5) - (x_6 / x_7)))
19.142857142857142
[19.1424599]
[[1 1 1 -1]
 [4.99962044 3.99987364 -0.142875209 0.122467041]]
[[[1.00151753 -0.999747038 0.000885801506 0.000884682406]
  [5.00050545 -0.999747038 6.00139093 0.666765034]
  [8.99949455 -0.999746799 19.9978504 0.800035477]
  [-12.9984827 0.999747276 -41.9902649 -0.857160926]]

 [[0.00177049637 2.00126457 -1.00126421 -1.00177097]
  [20.855011 19.1406898 17.1413765 23.3303337]
  [0 0 0 0]
  [0 0 0 0]]

 [[19.1424599 -19.1389198 0.0338885225 9.24990891e-05]
  [0 0 0 0]
  [0 0 0 0]
  [0 0 0 0]]]


In [11]:
def print_collapsed_result(values, const_guess, operand_guess, operator_guess):
    # TODO: const_guess

    operands = tf.round(operand_guess)
    acc = resolve_values(const_guess, values, operands)
    operators = tf.round(operator_guess)

    result = eager_process_block(acc, operators)

    return result

NUM_LEAVES = 8
NUM_OPERATORS = 4
v1 = tf.range(NUM_LEAVES)
v2 = tf.range(NUM_OPERATORS)

cgv = tf.one_hot(v1 // 2, NUM_LEAVES//2, dtype=tf.float32)
const_guess = tf.concat([cgv, cgv],axis=1)
operand_guess = tf.one_hot(v1, NUM_LEAVES, dtype=tf.float32)
ogv = tf.expand_dims(tf.one_hot(v2, NUM_OPERATORS, dtype=tf.float32), axis=0)
operator_guess = tf.concat([ogv,ogv,ogv], axis=0)

print_collapsed_result(values, const_guess, operand_guess, operator_guess)

<tf.Tensor: shape=(1,), dtype=float32, numpy=array([19.14246], dtype=float32)>

In [12]:
def bind_opt_train_step(opt, entropy_weight=1e+2):
    @tf.function
    def train_step(const_guess, operand_guess, operator_guess, values, target):
        with tf.GradientTape() as tape:
            cg, opg, otg = const_guess, operand_guess, operator_guess
            # cg = tf.nn.softmax(cg)
            # opg = tf.nn.softmax(opg)
            # otg = tf.nn.softmax(otg)
            cg = to_prob_dist_all(cg)
            opg = to_prob_dist_all(opg)
            otg = to_prob_dist_all(otg)
            cg_entropy = 0.0 # TODO
            opg_entropy = tf.reduce_sum(entropy(opg))
            otg_entropy = tf.reduce_sum(entropy(otg))

            operands = resolve_values(cg, values, opg)
            result = unrolled_process_block_3(operands, otg)

            target_loss = tf.nn.l2_loss(result[0] - target)

            loss = target_loss + entropy_weight * (opg_entropy + otg_entropy)

        variables = [const_guess, operand_guess, operator_guess]
        grads = tape.gradient(loss, variables)
        opt.apply_gradients(zip(grads, variables))

        const_guess.assign(to_prob_dist_all(const_guess))
        operand_guess.assign(to_prob_dist_all(operand_guess))
        operator_guess.assign(to_prob_dist_all(operator_guess))

        return loss, target_loss, cg_entropy, opg_entropy, otg_entropy

    return train_step

NUM_LEAVES = 8
NUM_OPERATORS = 4
v1 = tf.range(NUM_LEAVES)
v2 = tf.range(NUM_OPERATORS)

cgv = tf.one_hot(v1 // 2, NUM_LEAVES//2, dtype=tf.float32)
const_guess = tf.concat([cgv, cgv],axis=1)
operand_guess = tf.one_hot(v1, NUM_LEAVES, dtype=tf.float32)
ogv = tf.expand_dims(tf.one_hot(v2, NUM_OPERATORS, dtype=tf.float32), axis=0)
operator_guess = tf.concat([ogv,ogv,ogv], axis=0)
values = tf.cast(v1,dtype=tf.float32)

const_guess = tf.Variable(const_guess)
operand_guess = tf.Variable(operand_guess)
operator_guess = tf.Variable(operator_guess)

target = 19.0

opt = tf.keras.optimizers.Adam(3e-4)
# opt = tf.keras.optimizers.SGD(1e-1)
train_step = bind_opt_train_step(opt)

with tf.GradientTape(persistent=True) as tape:
    tape.watch(const_guess)
    tape.watch(operand_guess)
    tape.watch(operator_guess)

    result = train_step(const_guess, operand_guess, operator_guess, values, target)

tf.print(result)
tf.print(tape.gradient(result, operand_guess))
tf.print(tape.gradient(result, operator_guess))

(1.66787195, 0.00939718913, 0, 0.00725521892, 0.00932952948)
[[-0.0836868286 0 0 ... 0 0 0]
 [0 -0.0836334229 0 ... 0 0 0]
 [0 0 -0.0835876465 ... 0 0 0]
 ...
 [0 0 0 ... -0.0832176208 0 0]
 [0 0 0 ... 0 -0.083530426 0]
 [0 0 0 ... 0 0 -0.0834960938]]
[[[-0.0715637207 0 0 0]
  [0 -0.0717392 0 0]
  [0 0 -0.0711593628 0]
  [0 0 0 -0.0711593628]]

 [[-0.0716095 0 0 0]
  [0 -0.0716400146 0 0]
  [0 0 -0.0716095 0]
  [0 0 0 -0.0716095]]

 [[-0.0709457397 0 0 0]
  [0 -0.0716095 0 0]
  [0 0 -0.0716095 0]
  [0 0 0 -0.0716095]]]


In [13]:
opt = tf.keras.optimizers.Adam(3e-4)
# opt = tf.keras.optimizers.Adam(1e-1)
# opt = tf.keras.optimizers.SGD(1e-1)
train_step = bind_opt_train_step(opt, 1e-1)

NUM_LEAVES = 8
NUM_OPERATORS = 4
v1 = tf.range(NUM_LEAVES)
v2 = tf.range(NUM_OPERATORS)

cgv = tf.one_hot(v1 // 2, NUM_LEAVES//2, dtype=tf.float32)
const_guess = tf.concat([cgv, cgv],axis=1)
operand_guess = tf.one_hot(v1, NUM_LEAVES, dtype=tf.float32)
ogv = tf.expand_dims(tf.one_hot(v2, NUM_OPERATORS, dtype=tf.float32), axis=0)
operator_guess = tf.concat([ogv,ogv,ogv], axis=0)
values = tf.cast(v1,dtype=tf.float32)

const_guess = tf.Variable(const_guess)
operand_guess = tf.Variable(operand_guess)
operator_guess = tf.Variable(operator_guess)

target = 7.0
steps = 10000
for i in range(10000):
    loss, target_loss, cg_entropy, opg_entropy, otg_entropy = train_step(const_guess, operand_guess, operator_guess, values, target)

    if i % (steps // 10) == 0:
        cg = const_guess.numpy()
        opg = operand_guess.numpy()
        otg = operator_guess.numpy()
        collapsed_result = print_collapsed_result(values, cg, opg, otg)

        tf.print(i, collapsed_result[0], loss, target_loss, cg_entropy, opg_entropy, otg_entropy)
        # tf.print(pretty_print_guess_tensor(cg, opg, otg))
tf.print(pretty_print_guess_tensor(cg, opg, otg))

0 19.1424599 73.6561661 73.6545105 0 0.00725521892 0.00932952948
1000 19.1424599 0.281267792 0.000275083061 0 1.10822678 1.70170009
2000 19.1424599 0.235393107 0.000212619008 0 0.996016324 1.35578847
3000 19.1424599 0.210792184 0.000214195199 0 0.852604926 1.25317502
4000 19.1424599 0.193560198 0.00014558353 0 0.763911068 1.17023516
5000 19.1424599 0.166466385 0.000101841229 0 0.649720371 1.01392508
6000 19.1424599 0.152047545 9.2705377e-05 0 0.607504189 0.912044168
7000 19.1424599 0.128444076 5.57974672e-05 0 0.540567935 0.743314624
8000 19.1424599 0.0981969163 3.11541626e-05 0 0.456686705 0.524970889
9000 19.1424599 0.0844734833 0.00081806659 0 0.405964851 0.430589318
(((x_0 + x_1) + (x_2 - x_3)) + ((x_4 * x_5) - (x_6 / x_7)))
