# Differentiable Bubble Sort

Differentiable implementation of bubble sort with configurable (learnable) comparator function


In [1]:
import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow.keras import layers, Input
from tensorflow.keras.models import Model
import numpy as np

In [2]:
tf.executing_eagerly()

True

In [3]:
tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

## Swap Function

Using linear interpolation for continious swap.

\begin{equation*}
new_a = a * t + b * (1 - t)
\end{equation*}
\begin{equation*}
new_b = b * t + a * (1 - t)
\end{equation*}

When $t = 0$, then $a$ and $b$ are swapped. When $t = 1$, they remain in place.

Other compare and swap strategies include [softmax approximation](https://github.com/johnhw/differentiable_sorting), [optimal transport](https://arxiv.org/pdf/1905.11885.pdf), [projecting into higher dimensional space](https://arxiv.org/pdf/2002.08871.pdf) etc


In [4]:
@tf.function
def swap(x, i, j, t=None):
    '''
        Linear interpolation swap
        x: Tensor - Expected dims: [2, feature_size]
        i: Tensor - Scalar, int-like
        j: Tensor - Scalar, int-like
        t: Tensor - Scalar, float-like
    '''
    x_shape = tf.shape(x)
    x_len = x_shape[0]
    
    if t is None:
        t = tf.zeros(x_shape)
    
    i_pos_mask = tf.expand_dims(tf.eye(x_len)[i],axis=-1)
    i_neg_mask = 1 - i_pos_mask
    i_element = K.sum(i_pos_mask * x, axis=0)
    
    j_pos_mask = tf.expand_dims(tf.eye(x_len)[j],axis=-1)
    j_neg_mask = 1 - j_pos_mask
    j_element = K.sum(j_pos_mask * x, axis=0)
    
    i_interp_element = t * i_element + (1 - t) * j_element
    j_interp_element = t * j_element + (1 - t) * i_element
    
    x = x * i_neg_mask + i_interp_element * i_pos_mask
    x = x * j_neg_mask + j_interp_element * j_pos_mask
    
    return x

In [5]:
x = tf.Variable([
    [1, 1, 0, 0],
    [1, 1, 0, 1],
    [1, 0, 0, 0],
    [1, 0, 1, 0],
    [1, 1, 1, 0]
],dtype=tf.float32)
t = tf.Variable(0 * tf.ones(tf.shape(x)),dtype=tf.float32)
i = tf.Variable(1,dtype=tf.int32)
j = tf.Variable(2,dtype=tf.int32)
with tf.GradientTape(persistent=True) as tape:
#     z = swap(x, i, j)
    z = swap(x, i, j, t)

print(z)
print(tape.gradient(z,x))
print(tape.gradient(z,t))

tf.Tensor(
[[1. 1. 0. 0.]
 [1. 0. 0. 0.]
 [1. 1. 0. 1.]
 [1. 0. 1. 0.]
 [1. 1. 1. 0.]], shape=(5, 4), dtype=float32)
tf.Tensor(
[[1. 1. 1. 1.]
 [1. 1. 1. 1.]
 [1. 1. 1. 1.]
 [1. 1. 1. 1.]
 [1. 1. 1. 1.]], shape=(5, 4), dtype=float32)
tf.Tensor(
[[ 0.  0.  0.  0.]
 [ 0.  1.  0.  1.]
 [ 0. -1.  0. -1.]
 [ 0.  0.  0.  0.]
 [ 0.  0.  0.  0.]], shape=(5, 4), dtype=float32)


In [6]:
x = tf.Variable([[1],[2],[3]],dtype=tf.float32)
t = tf.Variable(0 * tf.ones(tf.shape(x)),dtype=tf.float32)
i = tf.Variable(0,dtype=tf.int32)
j = tf.Variable(1,dtype=tf.int32)
with tf.GradientTape(persistent=True) as tape:
#     z = swap(x, i, j)
    z = swap(x, i, j, t)

print(z)
print(tape.gradient(z,x))
print(tape.gradient(z,t))

tf.Tensor(
[[2.]
 [1.]
 [3.]], shape=(3, 1), dtype=float32)
tf.Tensor(
[[1.]
 [1.]
 [1.]], shape=(3, 1), dtype=float32)
tf.Tensor(
[[-1.]
 [ 1.]
 [ 0.]], shape=(3, 1), dtype=float32)


## Bubble sort

Standard bubble sort implementation with injectable comparator function. It is to be noted that the $t$ parameter is used to decide whether to swap or not instead of having explicit conditionals.

In [7]:
@tf.function
def bubble_sort(x, cmp_fun):
    '''
        Bubble sort
        x: Tensor - Expected dims: [array_length, feature_size]
        cmp_fun: Function
    '''
    x_len = tf.shape(x)[0]
    for i in range(x_len):
        for j in range(i+1, x_len):
            cmp_x = tf.concat([x[i], x[j]], axis=0)
            cmp_x = tf.reshape(cmp_x, [1, 2, -1])
            t = cmp_fun(cmp_x)[0]
            x = swap(x, i, j, t)
    return x

### Sample comparator function

A sample comparator function for testing. The `tf.sign` makes it non-differentiable.

For the sake of the example. It counts the number of $1$s in the array.

In [8]:
@tf.function
def sample_comparator(x):
    '''
        x: Tensor - Expected dims: [batch_size, 2, feature_size]
    '''
    sv = tf.reduce_sum(x, axis=-1)
    sv = tf.subtract(sv[:,0], sv[:,1])
    return 1 - (tf.sign(sv) + 1) / 2

with tf.GradientTape() as tape:
    x = tf.Variable([
        [1,0,0,0],
        [1,1,1,1]
    ], dtype=tf.float32)
    cmp_x = tf.concat([x[0], x[1]], axis=0)
    cmp_x = tf.reshape(cmp_x, [1, 2, -1])
    cmp_result = sample_comparator(cmp_x)
    print(cmp_result)
    grad = tape.gradient(cmp_result, x)
    print(grad)

tf.Tensor([1.], shape=(1,), dtype=float32)
tf.Tensor(
[[0. 0. 0. 0.]
 [0. 0. 0. 0.]], shape=(2, 4), dtype=float32)


In [9]:
x = tf.Variable([[3],[1],[2]],dtype=tf.float32)
with tf.GradientTape() as tape:
    z = bubble_sort(x, sample_comparator)
    print(z)
    print(tape.gradient(z,x))

tf.Tensor(
[[1.]
 [2.]
 [3.]], shape=(3, 1), dtype=float32)
tf.Tensor(
[[1.]
 [1.]
 [1.]], shape=(3, 1), dtype=float32)


In [10]:
x = tf.Variable([
    [1, 1, 0],
    [1, 0, 0],
    [1, 1, 1]
],dtype=tf.float32)
with tf.GradientTape(persistent=True) as tape:
    z = bubble_sort(x, sample_comparator)

print(z)
print(tape.gradient(z,x))

tf.Tensor(
[[1. 0. 0.]
 [1. 1. 0.]
 [1. 1. 1.]], shape=(3, 3), dtype=float32)
tf.Tensor(
[[1. 1. 1.]
 [1. 1. 1.]
 [1. 1. 1.]], shape=(3, 3), dtype=float32)


In [11]:
x = tf.Variable([
    [1, 1, 0, 0],
    [1, 1, 0, 1],
    [1, 0, 0, 0],
    [1, 0, 1, 0],
    [1, 1, 1, 0]
],dtype=tf.float32)
with tf.GradientTape(persistent=True) as tape:
    z = bubble_sort(x, sample_comparator)

print(z)
print(tape.gradient(z,x))

tf.Tensor(
[[1.  0.  0.  0. ]
 [1.  0.5 0.5 0. ]
 [1.  0.5 0.5 0. ]
 [1.  1.  0.5 0.5]
 [1.  1.  0.5 0.5]], shape=(5, 4), dtype=float32)
tf.Tensor(
[[1. 1. 1. 1.]
 [1. 1. 1. 1.]
 [1. 1. 1. 1.]
 [1. 1. 1. 1.]
 [1. 1. 1. 1.]], shape=(5, 4), dtype=float32)


In [12]:
data_gen = lambda: np.tril(np.ones((10,10),dtype=np.float32))
actual_data = data_gen()
actual_data

array([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [1., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [1., 1., 1., 0., 0., 0., 0., 0., 0., 0.],
       [1., 1., 1., 1., 0., 0., 0., 0., 0., 0.],
       [1., 1., 1., 1., 1., 0., 0., 0., 0., 0.],
       [1., 1., 1., 1., 1., 1., 0., 0., 0., 0.],
       [1., 1., 1., 1., 1., 1., 1., 0., 0., 0.],
       [1., 1., 1., 1., 1., 1., 1., 1., 0., 0.],
       [1., 1., 1., 1., 1., 1., 1., 1., 1., 0.],
       [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]], dtype=float32)

In [13]:
shuffled_data = data_gen()
np.random.shuffle(shuffled_data)
shuffled_data

array([[1., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [1., 1., 1., 1., 1., 0., 0., 0., 0., 0.],
       [1., 1., 1., 0., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [1., 1., 1., 1., 0., 0., 0., 0., 0., 0.],
       [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1., 1., 1., 1., 0., 0.],
       [1., 1., 1., 1., 1., 1., 1., 0., 0., 0.],
       [1., 1., 1., 1., 1., 1., 1., 1., 1., 0.],
       [1., 1., 1., 1., 1., 1., 0., 0., 0., 0.]], dtype=float32)

In [14]:
z = bubble_sort(shuffled_data, sample_comparator)
z



<tf.Tensor: shape=(10, 10), dtype=float32, numpy=
array([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [1., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [1., 1., 1., 0., 0., 0., 0., 0., 0., 0.],
       [1., 1., 1., 1., 0., 0., 0., 0., 0., 0.],
       [1., 1., 1., 1., 1., 0., 0., 0., 0., 0.],
       [1., 1., 1., 1., 1., 1., 0., 0., 0., 0.],
       [1., 1., 1., 1., 1., 1., 1., 0., 0., 0.],
       [1., 1., 1., 1., 1., 1., 1., 1., 0., 0.],
       [1., 1., 1., 1., 1., 1., 1., 1., 1., 0.],
       [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]], dtype=float32)>

## Learnable Comparator Function

Since the setup is end-to-end differentiable. We can use a DNN as the comparator function and expect it to learn using backpropagation.

In [15]:
class ComparatorBlock(layers.Layer):
    def __init__(self):
        super(ComparatorBlock, self).__init__()
        self.dense1 = layers.Dense(10, kernel_initializer="he_normal",activation='relu')
        self.dense2 = layers.Dense(10, kernel_initializer="he_normal",activation='relu')
        self.dense3 = layers.Dense(1, activation='sigmoid')

    def build(self, input_shape):
        super(ComparatorBlock, self).build(input_shape)

    def call(self, x):
        vector_len = tf.shape(x)[-1]
        h = tf.reshape(x, [-1, 2 * vector_len])
        h = self.dense1(h)
        h = self.dense2(h)
        h = self.dense3(h)
        return h

In [16]:
# temp_comparator = ComparatorBlock()
# batch_size = 10
# vector_length = 10
# input_shape = (batch_size, 2, vector_length)
# output_shape = (batch_size, 1)
# x = tf.random.normal(input_shape)
# y = tf.math.round(tf.random.uniform(output_shape, minval=0, maxval=1))
# result = temp_comparator(x)
# print(x.shape, result.shape, y.shape)
# # print(len(temp_comparator.trainable_variables))

# a = Input(shape=(2, vector_length))
# b = temp_comparator(a)
# m = Model(inputs=a, outputs=b)
# m.compile(loss='mse', optimizer='adam')
# m.fit(x=x,y=y,epochs=100,batch_size=batch_size)

In [17]:
learned_comparator = ComparatorBlock()
learned_comparator(tf.zeros((1,2,shuffled_data.shape[-1])))
z = bubble_sort(shuffled_data, learned_comparator)
# print(z)
print(tf.nn.l2_loss(z - actual_data))

tf.Tensor(16.030499, shape=(), dtype=float32)


In [18]:
x = tf.Variable(shuffled_data, dtype=tf.float32)
with tf.GradientTape() as tape:
    z = bubble_sort(x, learned_comparator)
    loss = tf.nn.l2_loss(z - actual_data)
    grads = tape.gradient(loss, learned_comparator.trainable_variables)
    tf.print(grads)

[[[0.783679068 0.932329535 0 ... -0.0278506912 0.24463512 -0.903960705]
 [0.642294288 0.776055098 0 ... -0.00857573748 0.182930738 -0.723977387]
 [0.563284457 0.672721684 0 ... -0.0109619275 0.143761888 -0.60566175]
 ...
 [0.436410964 0.465857357 0 ... -0.0219736807 0.0640842244 -0.465248823]
 [0.290339291 0.285001934 0 ... -0.0156117454 0.0527317 -0.297663778]
 [0.26131022 0.246905908 0 ... -0.0153777292 0.0523141176 -0.281999111]], [0.783679068 0.932329535 0 ... -0.0278506912 0.24463512 -0.903960705], [[4.47132301 -0.224669605 -0.704437256 ... 0 -3.05646396 -2.78740692]
 [3.67464828 -0.208235726 -0.604034722 ... 0 -2.51188135 -2.2907629]
 [0 0 0 ... 0 0 0]
 ...
 [0.00105429813 -0.00538781192 0.0422475114 ... 0 -0.000720694661 -0.000657245517]
 [0.0124988221 -0.00527670793 0.0423009917 ... 0 -0.00854383223 -0.00779172592]
 [0.918580711 -0.0355195403 -0.161287606 ... 0 -0.627914667 -0.572640061]], [2.05973172 -0.107041143 -0.262543887 ... 0 -1.40797186 -1.28402972], [[-6.49071932]
 [-0

## Training

We can train the setup end-to-end withing Adam optimizer.

In [19]:
x = tf.Variable(shuffled_data, dtype=tf.float32)
opt = tf.keras.optimizers.Adam(learning_rate=3e-4)

@tf.function
def train_step():
    with tf.GradientTape() as tape:
        z = bubble_sort(x, learned_comparator)
        loss = tf.nn.l2_loss(z - actual_data)
    var_list = learned_comparator.trainable_variables
    grads = tape.gradient(loss, var_list)
    opt.apply_gradients(zip(grads, var_list))
    return loss

for i in range(1000):
    loss = train_step()
    if i % 100 == 0:
        tf.print(loss)


16.0304985
10.5972567
5.54998255
4.58355618
3.0567193
1.97006762
1.39458871
0.917508185
0.507980704
0.332600266


In [20]:
z = bubble_sort(x, learned_comparator)
z = tf.round(z)
print(z - actual_data)

tf.Tensor(
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]], shape=(10, 10), dtype=float32)
