# CapsNet

This is the core algorithm of this thesis. It is based on [Dynamic Routing Between Capsules](https://arxiv.org/abs/1710.09829) paper by Sara Sabour, Nicholas Frosst and Geoffrey E. Hinton.

I added few changes to [handson-ml](https://github.com/ageron/handson-ml/blob/master/extra_capsnets.ipynb)'s implementation. 

There is one big change from the original algorithm and it is primary casules' filter sizes. First 2 convolution layers has filter size of 9. In this thesis, we used a sattelite image and cut the picture with respect to coordinate system that is given. In our tests, 2 different image size and channel types are used. Sizes are 32x32 and 9x9. For 9x9 images, if we use filter size of 9, we got negative filter size thus for 9x9 images, filter size is droped to 3x3.



In [1]:
from __future__ import division, print_function, unicode_literals

In [2]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

In [3]:
import numpy as np
import tensorflow as tf

In [4]:
from data import get_data_set

In [5]:
tf.reset_default_graph()


# np.random.seed(42)
# tf.set_random_seed(42)

In [6]:
x_train,y_train =  get_data_set("train",input_path = "../input/data_9x9_13band/")
x_test,y_test = get_data_set("test",input_path = "../input/data_9x9_13band/")

In [7]:
x_train.shape

(404, 9, 9, 13)

In [8]:
X = tf.placeholder(shape=[None, x_train.shape[1], x_train.shape[2], x_train.shape[3]], dtype=tf.float32, name="X")

In [9]:
X

<tf.Tensor 'X:0' shape=(?, 9, 9, 13) dtype=float32>

# Primary Capsule

In [10]:
kernel_size_conv1 = 3
kernel_size_conv2 = 3
stride_conv1 = 1
stride_conv2 = 2
img_size = x_test.shape[1]
img_size = int(((img_size - kernel_size_conv1) / stride_conv1) + 1)
img_size = int(((img_size - kernel_size_conv2) / stride_conv2) + 1)

In [11]:
caps1_n_maps = 32
caps1_n_caps = caps1_n_maps * img_size * img_size # * band_size
caps1_n_dims = 8

In [12]:
conv1 = tf.layers.conv2d(X, name="conv1", filters = 256,
                        kernel_size = kernel_size_conv1, strides = stride_conv1,
                        padding = "valid",activation=tf.nn.relu)

In [13]:
conv1

<tf.Tensor 'conv1/Relu:0' shape=(?, 7, 7, 256) dtype=float32>

In [14]:
conv2 = tf.layers.conv2d(conv1, name="conv2", filters = caps1_n_maps * caps1_n_dims,
                        kernel_size = kernel_size_conv2, strides = stride_conv2,
                        padding = "valid",activation=tf.nn.relu)

In [15]:
conv2

<tf.Tensor 'conv2/Relu:0' shape=(?, 3, 3, 256) dtype=float32>

In [16]:
caps1_raw = tf.reshape(conv2, [-1, caps1_n_caps, caps1_n_dims],
                       name="caps1_raw")

In [17]:
caps1_raw

<tf.Tensor 'caps1_raw:0' shape=(?, 288, 8) dtype=float32>

In [18]:

def squash(s, axis=-1, epsilon=1e-7, name=None):
    with tf.name_scope(name, default_name="squash"):
        squared_norm = tf.reduce_sum(tf.square(s), axis=axis,
                                     keep_dims=True)
        safe_norm = tf.sqrt(squared_norm + epsilon)
        squash_factor = squared_norm / (1. + squared_norm)
        unit_vector = s / safe_norm
        return squash_factor * unit_vector

In [19]:
# 15
caps1_output = squash(caps1_raw, name="caps1_output") 
caps1_output

Instructions for updating:
keep_dims is deprecated, use keepdims instead


<tf.Tensor 'caps1_output/mul:0' shape=(?, 288, 8) dtype=float32>

# Digit Capsules

In [20]:
output_shape = np.unique(y_train).shape[0]
caps2_n_caps = output_shape
caps2_n_dims = 16
type(output_shape)

int

In [21]:
init_sigma = 0.1

W_init = tf.random_normal(
    shape=(1, caps1_n_caps, caps2_n_caps, caps2_n_dims, caps1_n_dims),
    stddev=init_sigma, dtype=tf.float32, name="W_init")
W = tf.Variable(W_init, name="W")

In [22]:
batch_size = tf.shape(X)[0]
W_tiled = tf.tile(W, [batch_size, 1, 1, 1, 1], name="W_tiled")

In [23]:
caps1_output_expanded = tf.expand_dims(caps1_output, -1,
                                       name="caps1_output_expanded")
caps1_output_tile = tf.expand_dims(caps1_output_expanded, 2,
                                   name="caps1_output_tile")
caps1_output_tiled = tf.tile(caps1_output_tile, [1, 1, caps2_n_caps, 1, 1],
                             name="caps1_output_tiled")

In [24]:
W_tiled

<tf.Tensor 'W_tiled:0' shape=(?, 288, 4, 16, 8) dtype=float32>

In [25]:
caps1_output_tiled

<tf.Tensor 'caps1_output_tiled:0' shape=(?, 288, 4, 8, 1) dtype=float32>

In [26]:
caps2_predicted = tf.matmul(W_tiled, caps1_output_tiled,
                            name="caps2_predicted")

In [27]:
caps2_predicted

<tf.Tensor 'caps2_predicted:0' shape=(?, 288, 4, 16, 1) dtype=float32>

# Routing by agreement

In [28]:
raw_weights = tf.zeros([batch_size, caps1_n_caps, caps2_n_caps, 1, 1],
                       dtype=np.float32, name="raw_weights")
raw_weights

<tf.Tensor 'raw_weights:0' shape=(?, 288, 4, 1, 1) dtype=float32>

In [29]:
routing_weights = tf.nn.softmax(raw_weights, dim=2, name="routing_weights")

Instructions for updating:
dim is deprecated, use axis instead


In [30]:
weighted_predictions = tf.multiply(routing_weights, caps2_predicted,
                                   name="weighted_predictions")
weighted_sum = tf.reduce_sum(weighted_predictions, axis=1, keep_dims=True,
                             name="weighted_sum")

In [31]:

caps2_output_round_1 = squash(weighted_sum, axis=-2,
                              name="caps2_output_round_1")

In [32]:
caps2_output_round_1

<tf.Tensor 'caps2_output_round_1/mul:0' shape=(?, 1, 4, 16, 1) dtype=float32>

In [33]:
caps2_output_round_1_tiled = tf.tile(
    caps2_output_round_1, [1, caps1_n_caps, 1, 1, 1],
    name="caps2_output_round_1_tiled")

In [34]:
agreement = tf.matmul(caps2_predicted, caps2_output_round_1_tiled,
                      transpose_a=True, name="agreement")

In [35]:
raw_weights_round_2 = tf.add(raw_weights, agreement,
                             name="raw_weights_round_2")

In [36]:

routing_weights_round_2 = tf.nn.softmax(raw_weights_round_2,
                                        dim=2,
                                        name="routing_weights_round_2")
weighted_predictions_round_2 = tf.multiply(routing_weights_round_2,
                                           caps2_predicted,
                                           name="weighted_predictions_round_2")
weighted_sum_round_2 = tf.reduce_sum(weighted_predictions_round_2,
                                     axis=1, keep_dims=True,
                                     name="weighted_sum_round_2")
caps2_output_round_2 = squash(weighted_sum_round_2,
                              axis=-2,
                              name="caps2_output_round_2")

In [37]:
caps2_output = caps2_output_round_2

In [38]:
caps2_output

<tf.Tensor 'caps2_output_round_2/mul:0' shape=(?, 1, 4, 16, 1) dtype=float32>

# Estimated Class Probabilities (Length)

In [39]:

def safe_norm(s, axis=-1, epsilon=1e-7, keep_dims=False, name=None):
    with tf.name_scope(name, default_name="safe_norm"):
        squared_norm = tf.reduce_sum(tf.square(s), axis=axis,
                                     keep_dims=keep_dims)
        return tf.sqrt(squared_norm + epsilon)

In [40]:
y_proba = safe_norm(caps2_output, axis=-2, name="y_proba")

In [41]:
y_proba_argmax = tf.argmax(y_proba, axis=2, name="y_proba")
y_proba_argmax

<tf.Tensor 'y_proba_1:0' shape=(?, 1, 1) dtype=int64>

In [42]:
y_pred = tf.squeeze(y_proba_argmax, axis=[1,2], name="y_pred")
y_pred

<tf.Tensor 'y_pred:0' shape=(?,) dtype=int64>

In [43]:
y = tf.placeholder(shape=[None], dtype=tf.int64, name="y")

In [44]:

m_plus = 0.9
m_minus = 0.1
lambda_ = 0.5

In [45]:
caps2_output

<tf.Tensor 'caps2_output_round_2/mul:0' shape=(?, 1, 4, 16, 1) dtype=float32>

In [46]:
T = tf.one_hot(y, depth=output_shape, name="T")
T

<tf.Tensor 'T:0' shape=(?, 4) dtype=float32>

In [47]:
caps2_output_norm = safe_norm(caps2_output, axis=-2, keep_dims=True,
                              name="caps2_output_norm")

In [48]:
present_error_raw = tf.square(tf.maximum(0., m_plus - caps2_output_norm),
                              name="present_error_raw")
present_error = tf.reshape(present_error_raw, shape=(-1, output_shape),
                           name="present_error")
present_error

<tf.Tensor 'present_error:0' shape=(?, 4) dtype=float32>

In [49]:
absent_error_raw = tf.square(tf.maximum(0., caps2_output_norm - m_minus),
                             name="absent_error_raw")
absent_error = tf.reshape(absent_error_raw, shape=(-1, output_shape),
                          name="absent_error")

In [50]:
L = tf.add(T * present_error, lambda_ * (1.0 - T) * absent_error,
           name="L")
L

<tf.Tensor 'L:0' shape=(?, 4) dtype=float32>

In [51]:
loss = tf.reduce_mean(tf.reduce_sum(L, axis=1), name="margin_loss")


# Accuracy & Optimizer

In [52]:
correct = tf.equal(y, y_pred, name="correct")
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32), name="accuracy")

In [53]:
optimizer = tf.train.AdamOptimizer(learning_rate = 0.001)
training_op = optimizer.minimize(loss, name="training_op")

In [54]:
init = tf.global_variables_initializer()
saver = tf.train.Saver()

# Lets TRAIN

In [55]:
n_epochs = 150
batch_size = 15
restore_checkpoint = True
num_train_examples = x_train.shape[0]
num_test_examples = x_test.shape[0]
n_iterations_per_epoch = num_train_examples // batch_size
n_iterations_validation = num_test_examples // batch_size
best_loss_val = np.infty
checkpoint_path = "./my_capsule_network"

print("training set size : ", num_train_examples)
print("validation set size : ", num_test_examples)


training set size :  404
validation set size :  793


In [56]:
def shuffle(X,Y):
    X_per = []
    Y_per = []
    permutation = np.random.permutation(X.shape[0])
    for i in permutation:
        X_per.append(X[i])
        Y_per.append(Y[i])
        
    return np.array(X_per),np.array(Y_per)


In [57]:
# x_train[0:40].shape
with tf.Session() as sess:
#     if restore_checkpoint and tf.train.checkpoint_exists(checkpoint_path):
#         saver.restore(sess, checkpoint_path)
#     else:
    init.run()
    
    for epoch in range(n_epochs):
        x_train,y_train = shuffle(x_train,y_train)
        x_test,y_test = shuffle(x_test,y_test)
        for iteration in range(0, n_iterations_per_epoch):
            X_batch = x_train[iteration * batch_size: (iteration * batch_size) + batch_size ]
            y_batch = y_train[iteration * batch_size: (iteration * batch_size) + batch_size ]
            
            _, loss_train = sess.run(
                [training_op, loss],
                feed_dict={X: X_batch,
                           y: y_batch})
            
            print("\rIteration: {}/{} ({:.1f}%)  Loss: {:.5f}".format(
                      iteration, n_iterations_per_epoch,
                      iteration * 100 / n_iterations_per_epoch,
                      loss_train),
                  end="")
        
        
        loss_vals = []
        acc_vals = []
        
        for iteration in range(0, n_iterations_validation):
            X_batch = x_test[iteration * batch_size: (iteration * batch_size) + batch_size ]
            y_batch = y_test[iteration * batch_size: (iteration * batch_size) + batch_size ]
            
            loss_val, acc_val = sess.run(
                    [loss, accuracy],
                    feed_dict={X: X_batch,
                               y: y_batch})
            loss_vals.append(loss_val)
            acc_vals.append(acc_val)
            
            print("\rEvaluating the model: {}/{} ({:.1f}%)".format(
                      iteration, n_iterations_validation,
                      iteration * 100 / n_iterations_validation),
                  end=" " * 10)
            
            
        loss_val = np.mean(loss_vals)
        acc_val = np.mean(acc_vals)
        print("\rEpoch: {}  Val accuracy: {:.4f}%  Loss: {:.6f}{}".format(
            epoch + 1, acc_val * 100, loss_val,
            " (improved)" if loss_val < best_loss_val else ""))
        if loss_val < best_loss_val:
#             save_path = saver.save(sess, checkpoint_path)
            best_loss_val = loss_val

Epoch: 1  Val accuracy: 72.5641%  Loss: 0.215379 (improved)
Epoch: 2  Val accuracy: 84.3590%  Loss: 0.120701 (improved)
Epoch: 3  Val accuracy: 79.6154%  Loss: 0.134813
Epoch: 4  Val accuracy: 85.6410%  Loss: 0.101840 (improved)
Epoch: 5  Val accuracy: 87.6923%  Loss: 0.093770 (improved)
Epoch: 6  Val accuracy: 78.5897%  Loss: 0.143587
Epoch: 7  Val accuracy: 85.2564%  Loss: 0.103384
Epoch: 8  Val accuracy: 80.5128%  Loss: 0.133410
Epoch: 9  Val accuracy: 85.5128%  Loss: 0.096175
Epoch: 10  Val accuracy: 87.1795%  Loss: 0.086118 (improved)
Epoch: 11  Val accuracy: 86.2821%  Loss: 0.095586
Epoch: 12  Val accuracy: 86.5385%  Loss: 0.092326
Epoch: 13  Val accuracy: 86.6667%  Loss: 0.090499
Epoch: 14  Val accuracy: 88.0769%  Loss: 0.094461
Epoch: 15  Val accuracy: 89.2308%  Loss: 0.080076 (improved)
Epoch: 16  Val accuracy: 86.5385%  Loss: 0.098274
Epoch: 17  Val accuracy: 88.0769%  Loss: 0.087442
Epoch: 18  Val accuracy: 85.5128%  Loss: 0.099029
Epoch: 19  Val accuracy: 86.4102%  Loss: 0.