## Part 1: Setup

In [1]:
import tensorflow as tf
import numpy as np
from PIL import Image

num_classes = 200
limit = 3
batch_size = 10
epochs = 10

def load(filename):
    file = open(filename, "r") 
    image_names = file.readlines()
    images = []
    labels = []
    for name in image_names:
        label = int(name[:3])
        if label <= limit:
            im = Image.open("images/" + name.rstrip('\n'))
            H, W = im.size
            pixels = list(im.getdata())
            if not type(pixels[0]) is int:
                # todo: right now we are discarding transparent images
                image = np.array([comp for pixel in pixels for comp in pixel]).reshape(-1, H, W, 3)
                images.append(image)
                # zero-index the label
                labels.append(label - 1)
    return images, labels

images_train, labels_train = load('train.txt')
images_test, labels_test = load('test.txt')

print(len(images_train))
print(len(images_test))

300
242


## Part 2: Network

In [2]:
# todo: use tf.contrib.layers.conv2d for 3D filter
def conv_relu(input_image, kernel_shape, bias_shape, stride = 2):
    strides = [1, stride, stride, 1]
    weights = tf.get_variable("weights", kernel_shape, initializer=tf.random_normal_initializer())
    biases = tf.get_variable("biases", bias_shape, initializer=tf.constant_initializer(0.0))
    conv = tf.nn.conv2d(input_image, weights, strides=strides, padding='SAME')
    return tf.nn.relu(conv + biases)

def conv_layers(input_image):
    # Variables created here will be named "convX/weights", "convX/biases".
    with tf.variable_scope("conv1"):
        out_channels = 35
        output = conv_relu(input_image, [5, 5, 3, out_channels], [out_channels], stride=1)
    with tf.variable_scope("conv2"):
        old_out_channels = out_channels
        out_channels = 50
        output = conv_relu(output, [5, 5, old_out_channels, out_channels], [out_channels])
    with tf.variable_scope("conv3"):
        old_out_channels = out_channels
        out_channels = 3
        return conv_relu(output, [5, 5, old_out_channels, out_channels], [out_channels])

def max_pool_2d_nxn_regions(inputs, output_size):
    inputs_shape = tf.shape(inputs)
    h = tf.cast(tf.gather(inputs_shape, 1), tf.int32)
    w = tf.cast(tf.gather(inputs_shape, 2), tf.int32)
    
    pooling_op = tf.reduce_max
    
#         pooling_op = tf.reduce_mean

    result = []
    n = output_size
    for row in range(output_size):
        for col in range(output_size):
            # start_h = floor(row / n * h)
            start_h = tf.cast(tf.floor(tf.multiply(row / n, tf.cast(h, tf.float32))), tf.int32)
            # end_h = ceil((row + 1) / n * h)
            end_h = tf.cast(tf.ceil(tf.multiply((row + 1) / n, tf.cast(h, tf.float32))), tf.int32)
            # start_w = floor(col / n * w)
            start_w = tf.cast(tf.floor(tf.multiply(col / n, tf.cast(w, tf.float32))), tf.int32)
            # end_w = ceil((col + 1) / n * w)
            end_w = tf.cast(tf.ceil(tf.multiply((col + 1) / n, tf.cast(w, tf.float32))), tf.int32)
            pooling_region = inputs[:, start_h:end_h, start_w:end_w, :]
            pool_result = pooling_op(pooling_region, axis=(1, 2))
            result.append(pool_result)
    return result

# Modified from RikHeijdens on https://github.com/tensorflow/tensorflow/issues/6011
def spp_layer(inputs, dimensions=[3, 2, 1]):
    # todo: fix this
    # print(inputs.get_shape()[1] < tf.constant(36, dtype=tf.int32))


#     if tf.less(inputs.get_shape()[1], dimensions[0] ** 2) or tf.less(inputs.get_shape()[2], dimensions[0] ** 2):
#         print(shape)
#         print('Size must be greater than {:d}x{:d}'.format(dimensions[0], dimensions[0]))
#         return None
    pool_list = []
    for pool_dim in dimensions:
        pool_list += max_pool_2d_nxn_regions(inputs, pool_dim)
    return tf.concat(pool_list, axis=1)

# todo: might be able to move this into session
def fc_layer(image, reuse):
    return tf.contrib.layers.fully_connected(image, num_classes, activation_fn=None, scope="fc", reuse=reuse)


tf.reset_default_graph()
fc_reuse = False
with tf.variable_scope("network") as scope:
    image_placeholders = []
    label_placeholders = []

    logits = []
    logit_labels = []

    for i in range(batch_size):
        image = tf.placeholder(tf.float32, (1,None,None,3), name='image_%d'%(i))
#         something = tf.identity(image, name='something')
#         if tf.reshape(something, [-1]).shape[0] == 6 is False:
        image_placeholders.append(image)
        label = tf.placeholder(tf.int64, name='label_%d'%(i))
        label_placeholders.append(label)

        logit = tf.to_float(image)
        logit = conv_layers(logit)
        logit = spp_layer(logit)

        if not logit is None:
            logit = fc_layer(logit, fc_reuse)
            logit = tf.reshape(logit, [-1])
            logits.append(logit)
            logit_labels.append(label)
            fc_reuse = True

        scope.reuse_variables()
        
#     logits = tf.convert_to_tensor(logits)
#     logit_labels = tf.convert_to_tensor(logit_labels)

# todo: check this!
# i wonder if there is going to be a naming problem
logits = tf.convert_to_tensor(logits)
logit_labels = tf.convert_to_tensor(logit_labels)

loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=logit_labels))
regularization_loss = tf.losses.get_regularization_loss()
total_loss = loss + 1e-6 * regularization_loss
optimizer = tf.train.MomentumOptimizer(0.001, 0.9)
with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
    opt = optimizer.minimize(total_loss)
correct = tf.equal(tf.argmax(logits, -1), logit_labels)
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

print( "Total number of variables used ", np.sum([v.get_shape().num_elements() for v in tf.trainable_variables()]) )

Total number of variables used  58813


## Part 3: Training

In [3]:
session = tf.Session()
session.run(tf.global_variables_initializer())

for epoch in range(epochs):
    np.random.seed(epoch)
    np.random.shuffle(images_train)
    np.random.seed(epoch)
    np.random.shuffle(labels_train)
    accuracy_vals, loss_vals = [], []
    for i in range(0, len(images_train) - batch_size + 1, batch_size):
        batch_images, batch_labels = images_train[i:i + batch_size], labels_train[i:i + batch_size]

#         batch_images, batch_labels = images_train[i:i + 2], labels_train[i:i + 2]
    
        # todo: this is not very good... (probably replace with 1 x 1 x 1 x 1 when I implement SPP filter)
        if batch_size - len(batch_images) > 0:
            print('testing diff: %d'%(batch_size - len(batch_images)))
            for j in range(len(batch_images), batch_size):
                batch_images.append(images_train[j - len(batch_images)])
                batch_labels.append(labels_train[j - len(batch_images)])

        fd = {**{k: v for k, v in zip(image_placeholders, batch_images)}, **{k: v for k, v in zip(label_placeholders, batch_labels )}}

        accuracy_val, loss_val, _ = session.run([accuracy, total_loss, opt], feed_dict=fd)
        accuracy_vals.append(accuracy_val)
        loss_vals.append(loss_val)
    val_correct = []
    for i in range(0, len(images_test), batch_size):
#         batch_images, batch_labels = images_test[i:i + batch_size], labels_test[i:i + batch_size]
        batch_images, batch_labels = images_train[i:i + batch_size], labels_train[i:i + batch_size]
        
        # todo: this is not very good... (probably replace with 1 x 1 x 1 x 1 when I implement SPP filter)
        if batch_size - len(batch_images) > 0:
            print('training diff: %d'%(batch_size - len(batch_images)))
            for j in range(len(batch_images), batch_size):
                batch_images.append(images_train[j - len(batch_images)])
                batch_labels.append(labels_train[j - len(batch_images)])
            
        fd = {**{k: v for k, v in zip(image_placeholders, batch_images)}, **{k: v for k, v in zip(label_placeholders, batch_labels )}}
        val_correct.extend( session.run(correct, feed_dict=fd) )
    print('[%3d] Accuracy: %0.3f  \t  Loss: %0.3f  \t  validation accuracy: %0.3f'%(epoch, np.mean(accuracy_vals), np.mean(loss_vals), np.mean(val_correct)))


# session.run([accuracy], feed_dict={image: images_train[0]})
# batch_images = images_train[:batch_size]
# batch_labels = labels_train[:batch_size]
# fd = {**{i: d for i, d in zip(image_placeholders, batch_images)}, **{i: d for i, d in zip(label_placeholders, batch_labels )}}
# accuracy_val, loss_val, _  = session.run([accuracy, total_loss, opt], feed_dict=fd)
# print(accuracy_val)
# print(loss_val)
# print(logits[0])

# batch_images = images_train[batch_size:batch_size+batch_size]
# batch_labels = labels_train[batch_size:batch_size+batch_size]
# accuracy_val, loss_val, _  = session.run([accuracy, total_loss, opt], feed_dict=fd)
# print(accuracy_val)
# print(loss_val)

# print(logits[0])
# print(len(tf.trainable_variables()))

[  0] Accuracy: 0.073  	  Loss: 2744934352155574272.000  	  validation accuracy: 0.104
[  1] Accuracy: 0.077  	  Loss: 5.267  	  validation accuracy: 0.108
[  2] Accuracy: 0.083  	  Loss: 5.238  	  validation accuracy: 0.100
[  3] Accuracy: 0.060  	  Loss: 5.210  	  validation accuracy: 0.112
[  4] Accuracy: 0.083  	  Loss: 5.182  	  validation accuracy: 0.108


KeyboardInterrupt: 