In [18]:
import tensorflow as tf
import numpy as np
from PIL import Image

num_classes = 200

def load(filename):
    file = open(filename, "r") 
    image_names = file.readlines()
    images = []
    labels = []
    for name in image_names:
        label = int(name[:3])
        if label <= 5:
            im = Image.open("images/" + name.rstrip('\n'))
            H, W = im.size
            pixels = list(im.getdata())
            if not type(pixels[0]) is int:
                # todo: right now we are discarding transparent images
                image = np.array([comp for pixel in pixels for comp in pixel]).reshape(-1, H, W, 3)
                images.append(image)
                # zero-index the label
                labels.append(label - 1)
    return images, labels

images_train, labels_train = load('train.txt')

print(len(images_train))
print(len(labels_train))

150
150


In [21]:
# todo: use tf.contrib.layers.conv2d for 3D filter
def conv_relu(input_image, kernel_shape, bias_shape, stride = 2):
    strides = [stride, stride, stride, stride]
    weights = tf.get_variable("weights", kernel_shape, initializer=tf.random_normal_initializer())
    biases = tf.get_variable("biases", bias_shape, initializer=tf.constant_initializer(0.0))
    conv = tf.nn.conv2d(input_image, weights, strides=strides, padding='SAME')
    return tf.nn.relu(conv + biases)

def conv_layers(input_image):
    # Variables created here will be named "convX/weights", "convX/biases".
    with tf.variable_scope("conv1"):
        out_channels = 15
        output = conv_relu(input_image, [5, 5, 3, out_channels], [out_channels], stride=1)
    with tf.variable_scope("conv2"):
        old_out_channels = out_channels
        out_channels = 20
        output = conv_relu(output, [5, 5, old_out_channels, out_channels], [out_channels])
    with tf.variable_scope("conv3"):
        old_out_channels = out_channels
        out_channels = 3
        return conv_relu(output, [5, 5, old_out_channels, out_channels], [out_channels])
    
# https://github.com/tensorflow/tensorflow/issues/6011
def spp_layer(image, levels=[6, 3, 2, 1]):
    shape = image.get_shape().as_list()
    if shape[1] < levels[0] ** 2 or shape[2] < levels[0] ** 2:
        print(shape)
        print('Size must be greater than {:d}x{:d}'.format(levels[0], levels[0]))
        return None

    with tf.variable_scope('spp'):
        pool_outputs = []
        for level in levels:
            # todo: figure out why it is surrounded by 1
            window_size = [1] + [np.ceil(d / level).astype(np.int32) for d in shape[1:3]] + [1]
            strides = [1] + [np.floor(d / level + 1).astype(np.int32) for d in shape[1:3]] + [1]
            
            pool = tf.nn.max_pool(image, ksize=window_size, strides=strides, padding='SAME')
            pool_outputs.append(tf.reshape(pool, [shape[0], -1]))
        spp_pool = tf.concat(pool_outputs, axis=1)
    return spp_pool

def fc_layer(image, reuse):
    return tf.contrib.layers.fully_connected(image, num_classes, activation_fn=None, scope="fc", reuse=reuse)
    
tf.reset_default_graph()

inputs = tf.placeholder(tf.float32, (None,None,None,3))
labels = tf.placeholder(tf.int64, (None), name='labels')

logits = []
logit_labels = []

with tf.variable_scope("network") as scope:
    fc_reuse = False
    for index, image in enumerate(inputs):
        output = tf.convert_to_tensor(image)
        output = tf.to_float(output)
        output = conv_layers(output)
        output = spp_layer(output)
        if not output is None:
            output = fc_layer(output, fc_reuse)
            output = tf.reshape(output, [-1])
            logits.append(output)
            logit_labels.append(labels[index])
            fc_reuse = True

        scope.reuse_variables()

    print(logits)
    loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=logit_labels))

regularization_loss = tf.losses.get_regularization_loss()
total_loss = loss + 1e-6 * regularization_loss
# todo: play around with optimizer
optimizer = tf.train.MomentumOptimizer(0.001, 0.9)
with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
    opt = optimizer.minimize(total_loss)
correct = tf.equal(tf.argmax(logits, 1), logit_labels)
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
print( "Total number of variables used: ", np.sum([v.get_shape().num_elements() for v in tf.trainable_variables()]) )

TypeError: 'Tensor' object is not iterable.

In [None]:
images_test, labels_test = load('test.txt')

epochs = 20
batch_size = 32
sess = tf.Session()
sess.run(tf.global_variables_initializer())

for epoch in range(epochs):
    np.random.seed(epoch)
    np.random.shuffle(images_train)
    np.random.seed(epoch)
    np.random.shuffle(labels_train)
    accuracy_vals, loss_vals = [], []
    
    for i in range(0, images_train.shape[0] - batch_size + 1, batch_size):
        batch_images, batch_labels = images_train[i:i + batch_size], labels_train[i:i + batch_size]
        accuracy_val, loss_val, _ = sess.run([accuracy, total_loss, opt], feed_dict={inputs: batch_images, labels: batch_labels})
        accuracy_vals.append(accuracy_val)
        loss_vals.append(loss_val)

    val_correct = []
    for i in range(0, image_test.shape[0], batch_size):
        batch_images, batch_labels = image_val[i:i + batch_size], label_val[i:i + batch_size]
        val_correct.extend( sess.run(correct, feed_dict={inputs: batch_images, labels: batch_labels}) )
    print('[%3d] Accuracy: %0.3f  \t  Loss: %0.3f  \t  validation accuracy: %0.3f'%(epoch, np.mean(accuracy_vals), np.mean(loss_vals), np.mean(val_correct)))

In [None]:
# FAILED SPPs
# # https://github.com/tensorflow/tensorflow/issues/6011
# def spp_layer(image, levels=[6, 3, 2, 1], name = 'SPP_layer'):
#     shape = image.get_shape()[1:3].as_list()
#     with tf.variable_scope(name):
#         pool_outputs = []
#         print(image.shape)
#         for level in levels:
#             window_size = [np.ceil(d / level).astype(np.int32) for d in shape]
#             strides = [np.floor(d / level + 1).astype(np.int32) for d in shape]
            
#             # todo: figure out why it is surrounded by 1 
#             ksize = [1, window_size[0], window_size[1], 1]
#             strides = [1, strides[0], strides[1], 1]
            
#             print(ksize)
#             print(strides)
            
#             pool = tf.nn.max_pool(image, ksize=ksize, strides=strides, padding='SAME')
#             pool_outputs.append(tf.reshape(pool, [shape[0], -1]))
#         spp_pool = tf.concat(pool_outputs, axis=1)
#         print(spp_pool)
#     return spp_pool

# # https://github.com/tensorflow/tensorflow/issues/6011
# def spp_layer(image, levels=[6, 3, 2, 1], name = 'SPP_layer'):
#     shape = image.get_shape().as_list()
#     with tf.variable_scope(name):
#         pool_outputs = []
#         for level in levels:
#             # todo: figure out why it is surrounded by 1 
#             window_size = [1] + [np.ceil(d / level).astype(np.int32) for d in shape[1:3]] + [1]
#             strides = [1, np.floor(shape[1] / level + 1).astype(np.int32), np.floor(shape[2] / level + 1), 1]
            
#             pool = tf.nn.max_pool(image, ksize=window_size, strides=strides, padding='SAME')
#             pool_outputs.append(tf.reshape(pool, [shape[0], -1]))
#         spp_pool = tf.concat(pool_outputs, axis=1)
#     return spp_pool

# MISC

# def conv_layers(input_image):
#     # Variables created here will be named "convX/weights", "convX/biases".
#     with tf.variable_scope("conv1"):
#         out_channels = 10
#         output = conv_relu(input_image, [15, 15, 3, out_channels], [out_channels])
#     with tf.variable_scope("conv2"):
#         old_out_channels = out_channels
#         out_channels = 15
#         output = conv_relu(output, [5, 5, old_out_channels, out_channels], [out_channels])
#     with tf.variable_scope("conv3"):
#         old_out_channels = out_channels
#         out_channels = 15
#         output = conv_relu(output, [5, 5, old_out_channels, out_channels], [out_channels])
#     with tf.variable_scope("conv4"):
#         old_out_channels = out_channels
#         out_channels = 3
#         return conv_relu(output, [5, 5, old_out_channels, out_channels], [out_channel