In [1]:
import numpy as np
import tensorflow as tf
import cv2
import json

# Define VGG-16 Net

In [None]:
from kaffe.tensorflow import Network

class VGG_ILSVRC_16_layers(Network):
    """VGG-16 (ILSVRC) layer stack declared on top of the kaffe ``Network`` base.

    The graph is five convolutional stages (2, 2, 3, 3 and 3 conv layers with
    64, 128, 256, 512 and 512 as their third argument), each closed by a
    max-pool, followed by three fully connected layers (4096, 4096, 1000)
    and a softmax named 'prob'.
    """
    def setup(self):
        """Chain every layer, starting from the 'input' feed and ending at 'prob'."""
        # NOTE(review): conv arguments are presumably (kernel_h, kernel_w,
        # channels_out, stride_h, stride_w) and max_pool arguments
        # (kernel_h, kernel_w, stride_h, stride_w) -- confirm against
        # kaffe.tensorflow.Network.
        (self.feed('input')
             .conv(3, 3, 64, 1, 1, name='conv1_1')
             .conv(3, 3, 64, 1, 1, name='conv1_2')
             .max_pool(2, 2, 2, 2, name='pool1')
             .conv(3, 3, 128, 1, 1, name='conv2_1')
             .conv(3, 3, 128, 1, 1, name='conv2_2')
             .max_pool(2, 2, 2, 2, name='pool2')
             .conv(3, 3, 256, 1, 1, name='conv3_1')
             .conv(3, 3, 256, 1, 1, name='conv3_2')
             .conv(3, 3, 256, 1, 1, name='conv3_3')
             .max_pool(2, 2, 2, 2, name='pool3')
             .conv(3, 3, 512, 1, 1, name='conv4_1')
             .conv(3, 3, 512, 1, 1, name='conv4_2')
             .conv(3, 3, 512, 1, 1, name='conv4_3')
             .max_pool(2, 2, 2, 2, name='pool4')
             .conv(3, 3, 512, 1, 1, name='conv5_1')
             .conv(3, 3, 512, 1, 1, name='conv5_2')
             .conv(3, 3, 512, 1, 1, name='conv5_3')
             .max_pool(2, 2, 2, 2, name='pool5')
             # fully connected head; fc8 is created with relu=False and feeds the softmax
             .fc(4096, name='fc6')
             .fc(4096, name='fc7')
             .fc(1000, relu=False, name='fc8')
             .softmax(name='prob'))

In [None]:
# Image batch fed to the network: any batch size, 224x224 pixels, 3 channels.
inputs = tf.placeholder(tf.float32, [None, 224, 224, 3])
# initialize VGG-16 net, wiring the placeholder to the layer stack's 'input' feed
net = VGG_ILSVRC_16_layers({'input': inputs})
# we only need to extract fc7 layer features (the second 4096-unit fc layer)
fc7 = net.layers['fc7']

In [None]:
def load_images(image_list):
    """Load images into one float32 numpy array with the VGG mean pixel removed.

    Args:
        image_list: sequence of image file paths readable by OpenCV.

    Returns:
        float32 array of shape (num_images, height, width, 3). The channel
        order is whatever cv2.imread returns (BGR per the OpenCV docs), and
        the per-channel mean is subtracted in that same order.

    Raises:
        IOError: if an image cannot be read (cv2.imread returned None).
        ValueError: from np.stack, if image_list is empty or the images do
            not all share the same shape.
    """
    # specific for vgg model, subtract mean pixel value (one entry per channel)
    mean_pixel = np.array([103.939, 116.779, 123.68], dtype=np.float32)
    image_full_batch = []
    for image_path in image_list:
        # 1 means read color image
        image = cv2.imread(image_path, 1)
        if image is None:
            # cv2.imread signals a missing/undecodable file by returning None
            # instead of raising, so fail here with the offending path.
            raise IOError("Could not read image: %s" % image_path)
        image_full_batch.append(image.astype('float32'))
    # put all image tensors together into the full batch
    image_full_batch = np.stack(image_full_batch)
    # broadcasting over the last (channel) axis replaces the per-channel loop
    image_full_batch -= mean_pixel
    return image_full_batch

In [None]:
def generate_batch(inputs, batch_size=100):
    """Yield consecutive mini batches sliced along the first axis of inputs.

    Args:
        inputs: array exposing ``.shape`` and supporting slicing
            (e.g. a numpy array).
        batch_size: positive number of rows per batch.

    Yields:
        Slices of at most batch_size rows, in order. The last slice holds the
        remaining rows and may be smaller. Because the rows are taken by
        slicing, the number of dimensions is preserved even when a single
        example is left.

    Raises:
        ValueError: if batch_size is not positive (raised when iteration starts).
    """
    if batch_size <= 0:
        # 0 used to raise ZeroDivisionError and a negative value silently
        # yielded the whole input as one batch.
        raise ValueError("batch_size must be positive, got %r" % (batch_size,))
    full_size = inputs.shape[0]
    for start in range(0, full_size, batch_size):
        yield inputs[start:start + batch_size]

In [None]:
def extract_image_features(images):
    """Run the VGG-16 graph over images and return the stacked fc7 activations.

    A fresh session is opened, the variables are initialised, the pretrained
    weights are loaded from 'vgg_16.tfmodel', and the images are pushed
    through the network in mini batches of 200.
    """
    per_batch_features = []
    with tf.Session() as sess:
        # Load the data
        sess.run(tf.initialize_all_variables())
        net.load('vgg_16.tfmodel', sess)

        for image_batch in generate_batch(images, batch_size=200):
            # fetching the tensor itself (not a one-element list) returns the
            # batch_size x 4096 activation array directly
            per_batch_features.append(sess.run(fc7, feed_dict={inputs: image_batch}))

    return np.vstack(per_batch_features)

# Load images

In [None]:
# Read both path lists first, then decode each split into one image array.
with open('Corel5k/train_image_list.json') as f:
    train_image_list = json.load(f)
with open('Corel5k/test_image_list.json') as f:
    test_image_list = json.load(f)

train_images = load_images(train_image_list)
test_images = load_images(test_image_list)

# Extract image features and save for later use

In [None]:
# Run the VGG-16 forward pass once per split and persist the fc7 features to
# the .npy files that the "Load image features directly" cell reads back.
# np.save is given the path itself so the file is written in binary mode.
train_image_features = extract_image_features(train_images)
np.save("train_image_features.npy", train_image_features)

test_image_features = extract_image_features(test_images)
np.save("test_image_features.npy", test_image_features)

In [5]:
# Load image features directly
# .npy is a binary format: passing the path lets np.load open it in binary
# mode itself, whereas a text-mode handle ("r") fails on Python 3 and can
# corrupt the read on Windows.
train_image_features = np.load("train_image_features.npy")

test_image_features = np.load("test_image_features.npy")

# Image embedding

In [14]:
# Width of the projected image vector; the same value sizes the LSTM cell.
num_hidden = 512

# One 4096-dimensional feature row per image (the fc7 width).
image_features = tf.placeholder(tf.float32, [None, 4096])
# Linear projection 4096 -> num_hidden: small random weights, bias initialised to 0.1.
image_embedding_weights = tf.Variable(tf.truncated_normal([4096, num_hidden], stddev=0.01))
image_embedding_bias = tf.Variable(tf.constant(0.1, shape=[num_hidden]))
# h0 is the projected image; c0 is an all-zero tensor of the same shape.
# Both are passed to the LSTM as its initial state.
h0 = tf.matmul(image_features, image_embedding_weights) + image_embedding_bias
c0 = tf.zeros_like(h0)

# Load annotations and create label inputs/targets

In [28]:
with open('Corel5k/train_annotations.json') as f:
    train_annotations = json.load(f)

with open('Corel5k/test_annotations.json') as f:
    test_annotations = json.load(f)

# Largest label id found in the training annotations (empty annotations are
# skipped). NOTE(review): a test label above this value would fail the one-hot
# lookup below -- confirm the test set never exceeds it.
vocabulary_size = max(max(x) for x in train_annotations if x) # for Corel5k specifically
# parenthesised form prints the same value under Python 2 and Python 3
print(vocabulary_size)

# create inputs and targets with "START"/"STOP" signal
# note that Python is 0-indexed, and labels do not use 0, so we use 0 for special signal
train_label_inputs = [[0] + annotation for annotation in train_annotations]
train_label_targets = [annotation + [0] for annotation in train_annotations]
# for test data
test_label_inputs = [[0] + annotation for annotation in test_annotations]
test_label_targets = [annotation + [0] for annotation in test_annotations]

# transform to one-hot vector
# build the identity matrix once; indexing it with a list of label ids returns
# a fresh array holding one one-hot row per label
one_hot_table = np.eye(vocabulary_size + 1)
train_label_targets = [one_hot_table[x] for x in train_label_targets]
test_label_targets = [one_hot_table[x] for x in test_label_targets]

260


# Label embedding

In [8]:
# Width of each label embedding vector.
embedding_size = 512
# Fixed number of timesteps every label sequence is padded to.
max_length = 6 # for Corel5k, maximum length of annotation is 5

# note that we have another START or STOP signal in addition to the real labels
# (hence vocabulary_size + 1 rows); entries start uniformly random in [-1, 1)
label_embedding_matrix = tf.Variable(
    tf.random_uniform([vocabulary_size + 1, embedding_size], -1.0, 1.0))

# Label ids of a single sample (one sequence per run, variable length).
label_input = tf.placeholder(tf.int32, [None])

# Padding label inputs with zero-vectors
# The looked-up embeddings are followed by all-zero rows so the sequence
# always has max_length rows; length() later relies on those rows being zero.
label_embedding = tf.concat(
    0, [tf.nn.embedding_lookup(label_embedding_matrix, label_input), 
        tf.zeros([max_length - tf.shape(label_input)[0], embedding_size])])
# add one new axis (batch_size==1)
label_embedding = tf.reshape(label_embedding, [1,-1,embedding_size])

# mask for setting sequence_length for each sample
def length(data):
    """Count, per sequence, the timesteps of data that are not all-zero (int32).

    The feature axis (2) is collapsed with max(|x|); its sign is 1 for a used
    timestep and 0 for a zero-padded one, and those flags are summed over the
    time axis (1).
    """
    step_is_used = tf.sign(tf.reduce_max(tf.abs(data), reduction_indices=2))
    steps_per_sequence = tf.reduce_sum(step_is_used, reduction_indices=1)
    return tf.cast(steps_per_sequence, tf.int32)

# LSTM model

In [15]:

num_layers = 1
# Keep-probability applied to the LSTM outputs; fed at run time
# (0.5 in the training loop, 1.0 in the test loop).
dropout = tf.placeholder(tf.float32)

# Why use state_is_tuple???
# NOTE(review): state_is_tuple=True presumably keeps the cell state and hidden
# state as a separate (c, h) pair rather than one concatenated tensor, which is
# what lets (c0, h0) be passed below -- confirm against the TF rnn_cell docs.
lstm = tf.nn.rnn_cell.LSTMCell(num_hidden, state_is_tuple=True)  
lstm = tf.nn.rnn_cell.DropoutWrapper(lstm, output_keep_prob=dropout)
# NOTE(review): [lstm] * num_layers repeats the same cell object; harmless for
# num_layers == 1, revisit before stacking more layers.
lstm = tf.nn.rnn_cell.MultiRNNCell([lstm] * num_layers, state_is_tuple=True)
# The projected image (h0) and the zero tensor (c0) seed the recurrent state;
# sequence_length comes from counting the non-zero rows of the padded embedding.
output, state = tf.nn.dynamic_rnn(lstm, label_embedding, dtype=tf.float32, 
                                  initial_state=(c0, h0), sequence_length=length(label_embedding))

# Softmax prediction and compute cross-entropy loss

In [36]:
# reshape output to be used in mini-batch fc layer
output = tf.reshape(output, [-1, num_hidden])
# keep only the rows of the real (non-padded) timesteps of the single sequence
output = tf.gather(output, tf.range(length(label_embedding)[0]))
# fc layer
weight = tf.Variable(tf.truncated_normal([num_hidden, vocabulary_size+1], stddev=0.01))
bias = tf.Variable(tf.constant(0.1, shape=[vocabulary_size+1]))

# unnormalised class scores, one row per timestep
logits = tf.matmul(output, weight) + bias
prediction = tf.nn.softmax(logits)

# target rows are one-hot vectors (built with np.eye in the annotation cell)
target = tf.placeholder(tf.float32, [None, vocabulary_size + 1])
# Compute the loss from the logits rather than tf.log(prediction): the fused
# op is the same cross-entropy but does not produce -inf/NaN when a softmax
# probability underflows to 0.
cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=target)
cross_entropy = tf.reduce_mean(cross_entropy)

optimizer = tf.train.AdamOptimizer(learning_rate=0.0001, epsilon=1e-2).minimize(cross_entropy)

# fraction of timesteps whose most likely label matches the target label
correct_pred = tf.equal(tf.argmax(prediction, 1), tf.argmax(target, 1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [17]:
# Session used by the training and test cells that follow.
sess = tf.InteractiveSession()

In [37]:
# start training
init = tf.initialize_all_variables()
sess.run(init)
max_epoch = 5
# Reporting window only: the model is updated after every single sample.
batch_size = 200
for epoch in range(max_epoch):
    # Reset the running sums at every epoch so a partial window left over from
    # the previous epoch does not leak into this epoch's first report.
    batch_loss = 0
    batch_accuracy = 0
    window_count = 0
    for i in range(len(train_label_inputs)):
        feed = {
            image_features: np.reshape(train_image_features[i], [-1, 4096]),
            label_input: train_label_inputs[i],
            target: train_label_targets[i],
            dropout: 0.5,
        }
        loss, tmp_accuracy, _ = sess.run([cross_entropy, accuracy, optimizer], feed_dict=feed)
        batch_loss += loss
        batch_accuracy += tmp_accuracy
        window_count += 1
        # Report once exactly batch_size samples were accumulated, so the
        # divisor always matches the number of summed values (the first
        # window used to sum 201 samples and divide by 200).
        if window_count == batch_size:
            print("epoch: %d, iteration: %d, batch_loss: %.3f, batch_accuracy: %.3f" % (
                epoch, i + 1, batch_loss / window_count, batch_accuracy / window_count))
            batch_loss = 0
            batch_accuracy = 0
            window_count = 0

epoch: 0, iteration: 200, batch_loss: 5.511, batch_accuracy: 0.194
epoch: 0, iteration: 400, batch_loss: 5.325, batch_accuracy: 0.236
epoch: 0, iteration: 600, batch_loss: 4.898, batch_accuracy: 0.270
epoch: 0, iteration: 800, batch_loss: 4.206, batch_accuracy: 0.242
epoch: 0, iteration: 1000, batch_loss: 3.985, batch_accuracy: 0.254
epoch: 0, iteration: 1200, batch_loss: 4.100, batch_accuracy: 0.272
epoch: 0, iteration: 1400, batch_loss: 3.675, batch_accuracy: 0.326
epoch: 0, iteration: 1600, batch_loss: 4.122, batch_accuracy: 0.288
epoch: 0, iteration: 1800, batch_loss: 3.866, batch_accuracy: 0.247
epoch: 0, iteration: 2000, batch_loss: 3.929, batch_accuracy: 0.268
epoch: 0, iteration: 2200, batch_loss: 3.768, batch_accuracy: 0.300
epoch: 0, iteration: 2400, batch_loss: 3.753, batch_accuracy: 0.301
epoch: 0, iteration: 2600, batch_loss: 3.533, batch_accuracy: 0.268
epoch: 0, iteration: 2800, batch_loss: 3.560, batch_accuracy: 0.292
epoch: 0, iteration: 3000, batch_loss: 3.558, batch_

# Test per-sample accuracy

In [40]:
# Sum the per-sample accuracy over the test set, then report its mean.
# range() and the parenthesised print behave the same on Python 2 and 3.
test_accuracy = 0
for i in range(len(test_label_inputs)):
    feed = {
        image_features: np.reshape(test_image_features[i], [-1, 4096]),
        label_input: test_label_inputs[i],
        target: test_label_targets[i],
        # keep every LSTM output at evaluation time
        dropout: 1.0,
    }
    tmp_accuracy = sess.run(accuracy, feed_dict=feed)
    test_accuracy += tmp_accuracy
print("test_accuracy: %.3f" % (test_accuracy / len(test_label_inputs)))

test_accuracy: 0.394
