In [11]:
import tensorflow as tf
import numpy as np
import time
import pprint
import os
import math
import cPickle
import random

# --- Notebook configuration -------------------------------------------------
# Caption annotation file; read by parse_token_file, which expects one
# "<image_name>#<index>\t<description>" record per line.
token_file = 'results_20130124.token'
# Directory that receives TensorBoard event files and "lstm" checkpoints;
# an existing checkpoint found here is restored before training starts.
output_dir = 'dir_runs/06091043'
# Glob pattern of the pickled image-feature files loaded by ImageCaptionData.
input_filenamepatten = 'features/*'
# Vocabulary file read by Vocab: one "<word>\t<occurrence count>" per line.
vocab_file = 'vocab.txt'
# Hyper-parameter override string handed to HParams.parse();
# presumably the empty string keeps every default -- confirm against HParams docs.
hp_config = ''

In [2]:
def parse_token_file(token_file):
    """Reads the caption annotation file into a dict of descriptions.

    Each non-blank line must look like ``<image_name>#<index>\\t<description>``.

    Args:
        token_file: path of the annotation file.

    Returns:
        dict mapping image_name -> list of description strings, in file order.

    Raises:
        ValueError: if a non-blank line has no tab separator or its id part
            has no '#'.
    """
    image_name_to_tokens = {}
    with open(token_file, 'r') as f:
        # Stream the file instead of materialising every line first.
        for line in f:
            line = line.strip('\r\n')
            # Blank lines (e.g. a trailing newline at EOF) carry no record.
            if not line.strip():
                continue
            # Only the first tab separates id from text, so a description
            # that itself contains a tab is kept intact.
            image_id, description = line.split('\t', 1)
            # The caption index follows the last '#'; splitting from the right
            # keeps image names that contain '#' usable.
            image_name, _ = image_id.rsplit('#', 1)
            image_name_to_tokens.setdefault(image_name, []).append(description)
    return image_name_to_tokens
  
def convert_token_to_id(image_name_to_tokens, vocab):
    """Encodes every description of every image with ``vocab.encode``.

    Args:
        image_name_to_tokens: dict mapping image name -> list of descriptions.
        vocab: object exposing ``encode(description)``.

    Returns:
        dict mapping image name -> list of encoded descriptions, one entry per
        input description and in the same order.
    """
    return {
        image_name: [vocab.encode(caption) for caption in captions]
        for image_name, captions in image_name_to_tokens.items()
    }

class Vocab(object):
    """Bidirectional word <-> id table loaded from a tab-separated vocab file.

    Every line of the file is ``<word>\\t<occurrence count>``. Words whose
    count is below ``word_num_threshold`` are dropped, except ``<UNK>`` which
    is always kept. Ids are assigned consecutively in file order.
    """

    def __init__(self, filename, word_num_threshold):
        self._word_num_threshold = word_num_threshold
        # Both stay -1 when the file has no '<UNK>' / '.' entry.
        self._unk = -1
        self._eos = -1
        self._word_to_id = {}
        self._id_to_word = {}
        self._read_dict(filename)

    def _read_dict(self, filename):
        """Fills the lookup tables from ``filename``."""
        with tf.gfile.GFile(filename, 'r') as vocab_fh:
            raw_lines = vocab_fh.readlines()

        min_count = self._word_num_threshold
        for raw_line in raw_lines:
            word, count_text = raw_line.strip('\r\n').split('\t')
            count = int(count_text)
            is_unk = (word == '<UNK>')
            # Rare words are skipped; '<UNK>' is exempt from the threshold.
            if not is_unk and count < min_count:
                continue

            next_id = len(self._id_to_word)
            if is_unk:
                self._unk = next_id
            elif word == '.':
                # The full stop doubles as the end-of-sentence token.
                self._eos = next_id

            already_known = (next_id in self._id_to_word
                             or word in self._word_to_id)
            if already_known:
                raise Exception('duplicate words in vocab file')
            self._id_to_word[next_id] = word
            self._word_to_id[word] = next_id

    @property
    def unk(self):
        """Id of '<UNK>' (-1 if the vocab file did not contain it)."""
        return self._unk

    @property
    def eos(self):
        """Id of '.' (-1 if the vocab file did not contain it)."""
        return self._eos

    def word_to_id(self, word):
        """Returns the id of ``word``, or the unk id for unknown words."""
        try:
            return self._word_to_id[word]
        except KeyError:
            return self.unk

    def id_to_word(self, cur_id):
        """Returns the word stored under ``cur_id``; KeyError if absent."""
        return self._id_to_word[cur_id]

    def size(self):
        """Number of words in the vocabulary."""
        return len(self._word_to_id)

    def encode(self, sentence):
        """Splits ``sentence`` on single spaces and maps each piece to an id."""
        return list(map(self.word_to_id, sentence.split(' ')))

In [3]:
class ImageCaptionData(object):
    """Serves mini-batches of (image features, caption token ids, weights).

    Image features are read from every pickle file matching
    ``image_feature_filepattern``; each file must unpickle to a
    ``(filenames, features)`` pair. For every image in a batch one of its
    captions is drawn at random and truncated / padded with ``vocab.eos`` to
    exactly ``num_timesteps`` ids. The weight is 1 for real tokens and 0 for
    padding.

    Args:
        image_name_to_token_ids: dict image name -> list of token-id lists.
        image_feature_filepattern: glob pattern of the feature pickle files.
        num_timesteps: fixed caption length of every returned sample.
        vocab: object exposing an ``eos`` id used for padding.
    """

    def __init__(self,
                 image_name_to_token_ids,
                 image_feature_filepattern,
                 num_timesteps,
                 vocab):
        self._vocab = vocab
        self._all_image_feature_filenames = tf.gfile.Glob(image_feature_filepattern)
        self._image_name_to_token_ids = image_name_to_token_ids
        self._num_timesteps = num_timesteps
        # Start offset of the next batch inside the (shuffled) data.
        self._indicator = 0
        self._image_feature_filenames = []
        self._image_feature_data = []
        self._load_image_feature_pickle()

    def _load_image_feature_pickle(self):
        """Loads and stacks all feature files into two aligned numpy arrays."""
        for filename in self._all_image_feature_filenames:
            tf.logging.info("loading %s" % filename)
            # Pickles are binary data, hence 'rb'.
            # NOTE(review): unpickling executes arbitrary code -- only point
            # the file pattern at feature files you produced yourself.
            with tf.gfile.GFile(filename, 'rb') as f:
                filenames, features = cPickle.load(f)
            self._image_feature_filenames += filenames
            self._image_feature_data.append(features)
        self._image_feature_data = np.vstack(self._image_feature_data)
        self._image_feature_filenames = np.asarray(self._image_feature_filenames)
        # Single-argument print(...) emits the same text on Python 2 and 3.
        print(self._image_feature_data.shape)
        print(self._image_feature_filenames.shape)

    def size(self):
        """Number of loaded images."""
        return len(self._image_feature_filenames)

    def image_feature_size(self):
        """Width of one image feature vector."""
        return self._image_feature_data.shape[1]

    def _random_shuffle(self):
        """Applies one random permutation to features and file names alike."""
        p = np.random.permutation(self.size())
        self._image_feature_filenames = self._image_feature_filenames[p]
        self._image_feature_data = self._image_feature_data[p]

    def _image_desc(self, filenames):
        """Draws one caption per image and pads it to ``num_timesteps``.

        Args:
            filenames: iterable of image names present in
                ``image_name_to_token_ids``.

        Returns:
            (batch_sentence_ids, batch_weights): two numpy arrays with one
            row of length ``num_timesteps`` per file name.
        """
        batch_sentence_ids = []
        batch_weights = []
        for filename in filenames:
            token_ids_set = self._image_name_to_token_ids[filename]
            # Slicing copies the caption. Padding the stored list in place
            # (the former `+=`) made it permanently `num_timesteps` long, so
            # later draws treated the padding as real tokens (weight 1).
            chosen_token_ids = random.choice(token_ids_set)[:self._num_timesteps]
            chosen_token_length = len(chosen_token_ids)
            remaining_length = self._num_timesteps - chosen_token_length

            batch_sentence_ids.append(
                chosen_token_ids + [self._vocab.eos] * remaining_length)
            batch_weights.append(
                [1] * chosen_token_length + [0] * remaining_length)
        batch_sentence_ids = np.asarray(batch_sentence_ids)
        batch_weights = np.asarray(batch_weights)
        return batch_sentence_ids, batch_weights

    def next(self, batch_size):
        """Returns the next batch, reshuffling once the data is exhausted.

        Args:
            batch_size: number of samples; must not exceed ``size()``.

        Returns:
            (batch_image_features, batch_sentence_ids, batch_weights).
        """
        # A batch never wraps around: leftover samples at the end of an epoch
        # are dropped and a freshly shuffled epoch starts.
        if self._indicator + batch_size > self.size():
            self._random_shuffle()
            self._indicator = 0

        end_indicator = self._indicator + batch_size
        assert end_indicator <= self.size()

        batch_image_features = self._image_feature_data[self._indicator: end_indicator]
        batch_sentence_ids, batch_weights = self._image_desc(
            self._image_feature_filenames[self._indicator: end_indicator])

        self._indicator = end_indicator
        return batch_image_features, batch_sentence_ids, batch_weights

In [4]:
def get_default_params():
    """Returns the default hyper-parameters as a ``tf.contrib.training.HParams``."""
    defaults = {
        # Vocabulary / data
        'num_vocab_word_threshold': 5,
        'num_timesteps': 10,
        'batch_size': 5,
        # Network shape
        'num_embedding_nodes': 16,
        'num_lstm_nodes': [32, 32],
        'num_lstm_layers': 2,
        'num_fc_nodes': 32,
        'cell_type': 'lstm',
        # Optimisation
        'clip_lstm_grads': 1.0,
        'learning_rate': 0.001,
        'keep_prob': 1.0,
        # Bookkeeping intervals, in global steps
        'log_frequent': 100,
        'save_frequent': 2000,
    }
    return tf.contrib.training.HParams(**defaults)


In [5]:
def create_rnn_cell(hidden_dim, cell_type):
    """Creates one recurrent cell with ``hidden_dim`` units.

    Args:
        hidden_dim: number of units of the cell.
        cell_type: 'lstm' (BasicLSTMCell) or 'gru' (GRUCell).

    Raises:
        Exception: for any other ``cell_type``.
    """
    if cell_type not in ('lstm', 'gru'):
        raise Exception("%s has not been supported" % cell_type)

    rnn = tf.contrib.rnn
    if cell_type == 'gru':
        return rnn.GRUCell(hidden_dim)
    return rnn.BasicLSTMCell(hidden_dim, state_is_tuple=True)

def dropout(cell, keep_prob):
    """Wraps ``cell`` so that dropout with ``keep_prob`` is applied to its output."""
    wrapped_cell = tf.contrib.rnn.DropoutWrapper(
        cell,
        output_keep_prob=keep_prob)
    return wrapped_cell

In [6]:
def create_model(hps, vocab_size, image_feature_dim):
    """Builds the training graph of the caption model in the default graph.

    The image feature is projected to the embedding width and used as the
    input of step 0; the inputs of steps 1 .. num_timesteps-1 are the
    embeddings of caption tokens 0 .. num_timesteps-2. At step i the network
    is trained to predict ``sentence[:, i]``, and the per-token cross entropy
    is weighted by ``mask[:, i]``.

    Args:
        hps: hyper-parameters. Reads num_timesteps, batch_size,
            num_embedding_nodes, num_lstm_nodes, num_lstm_layers, cell_type,
            num_fc_nodes, clip_lstm_grads and learning_rate.
        vocab_size: number of embedding rows and width of the output logits.
        image_feature_dim: width of the image feature placeholder.

    Returns:
        A 3-tuple
        ``((image_feature, sentence, mask, keep_prob),
           (loss, generated_words, train_op),
           global_step)``
        where the first element holds the placeholders to feed,
        ``generated_words`` is the per-step argmax word id with shape
        (batch_size, num_timesteps) and ``loss`` is the mask-weighted mean
        cross entropy.
    """
    num_timesteps = hps.num_timesteps
    batch_size = hps.batch_size

    image_feature = tf.placeholder(tf.float32, (batch_size, image_feature_dim))
    sentence = tf.placeholder(tf.int32, (batch_size, num_timesteps))
    mask = tf.placeholder(tf.float32, (batch_size, num_timesteps))
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')

    global_step = tf.Variable(tf.zeros([], tf.int64), name='global_step', trainable=False)

    # Sets up the embedding layer.
    embedding_initializer = tf.random_uniform_initializer(-1.0, 1.0)
    with tf.variable_scope('embedding', initializer=embedding_initializer):
        embeddings = tf.get_variable(
            'embeddings',
            [vocab_size, hps.num_embedding_nodes],
            tf.float32)
        # The last token is never an input: it is only a prediction target.
        embed_token_ids = tf.nn.embedding_lookup(embeddings, sentence[:, 0:num_timesteps-1])

    image_feature_embed_init = tf.uniform_unit_scaling_initializer(factor=1.0)
    with tf.variable_scope('image_feature_embed', initializer=image_feature_embed_init):
        embed_img = tf.layers.dense(image_feature, hps.num_embedding_nodes)
        # [batch, embed] -> [batch, 1, embed] so it can be prepended as step 0.
        embed_img = tf.expand_dims(embed_img, 1)
        embed_inputs = tf.concat([embed_img, embed_token_ids], axis=1)

    # Sets up LSTM network.
    # NOTE(review): the variable names logged for this graph place the LSTM
    # kernels under 'fc/...' rather than 'lstm_nn/...', which suggests the
    # cell variables are created at the first call below and therefore do not
    # pick up lstm_init. Left untouched because changing it would alter the
    # initialisation of new runs -- confirm the intent.
    scale = 1.0 / math.sqrt(
      hps.num_embedding_nodes + hps.num_lstm_nodes[-1]) / 3.0
    lstm_init = tf.random_uniform_initializer(-scale, scale)
    with tf.variable_scope('lstm_nn', initializer=lstm_init):
        cells = []
        for i in range(hps.num_lstm_layers):
            cell = create_rnn_cell(hps.num_lstm_nodes[i], hps.cell_type)
            cell = dropout(cell, keep_prob)
            cells.append(cell)
        cell = tf.contrib.rnn.MultiRNNCell(cells)

    fc_init = tf.uniform_unit_scaling_initializer(factor=1.0)
    with tf.variable_scope('fc', initializer=fc_init):
        state = cell.zero_state(hps.batch_size, tf.float32)
        cross_entropys = 0
        generated_words = []
        # The recurrence is unrolled by hand; every step after the first
        # reuses the variables created by step 0.
        for i in range(hps.num_timesteps):
            if i > 0:
                tf.get_variable_scope().reuse_variables()
            embed_input = embed_inputs[:, i, :]
            embed_input = tf.reshape(embed_input, [batch_size, hps.num_embedding_nodes])
            # rnn_output: [batch_size, hps.num_lstm_node[-1]]
            rnn_output, state = cell(embed_input, state)
            fc1 = tf.layers.dense(rnn_output, hps.num_fc_nodes, name='fc1')
            fc1_dropout = tf.contrib.layers.dropout(fc1, keep_prob)
            fc1_dropout = tf.nn.relu(fc1_dropout)
            # logit: [batch_size, class_num]
            logit = tf.layers.dense(fc1_dropout, vocab_size, name='logit')
            max_prob_word = tf.argmax(logit, axis=1)
            max_prob_word = tf.expand_dims(max_prob_word, 1)
            generated_words.append(max_prob_word)
            word_label = sentence[:, i]
            word_mask = mask[:, i]
            cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=logit, labels=word_label)
            # Padding positions carry mask 0 and do not contribute.
            cross_entropys += tf.reduce_sum(tf.multiply(cross_entropy, word_mask))

        # Mean over the unmasked tokens of the batch.
        loss = cross_entropys / tf.reduce_sum(mask)
        generated_words = tf.concat(generated_words, 1)

    with tf.variable_scope('train_op'):
        tvars = tf.trainable_variables()
        for var in tvars:
            tf.logging.info("variable name: %s" % (var.name))
        grads, _ = tf.clip_by_global_norm(tf.gradients(loss, tvars), hps.clip_lstm_grads)
        for grad, var in zip(grads, tvars):
            # var.name ends in ':0' and ':' is not allowed in a summary name.
            # TensorFlow then logs "Summary name ... is illegal" and swaps it
            # for '_'; doing the same substitution up front keeps the tags
            # identical while avoiding the message.
            tf.summary.histogram('%s_grad' % var.name.replace(':', '_'), grad)
        optimizer = tf.train.AdamOptimizer(hps.learning_rate)
        train_op = optimizer.apply_gradients(zip(grads, tvars),
                                             global_step=global_step)

    return ((image_feature, sentence, mask, keep_prob),
            (loss, generated_words, train_op),
            global_step)


In [17]:
# Training driver: builds vocabulary, data and model, restores the newest
# checkpoint from output_dir when there is one, then runs the training loop.

# The progress report of this cell consists of INFO-level tf.logging
# messages. Set the verbosity here so they show up on a fresh kernel instead
# of depending on state left behind by an earlier cell.
tf.logging.set_verbosity(tf.logging.INFO)

hps = get_default_params().parse(hp_config)

# output_dir is a nested path; makedirs also creates missing parent
# directories, which os.mkdir cannot do.
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

vocab = Vocab(vocab_file, hps.num_vocab_word_threshold)
vocab_size = vocab.size()
tf.logging.info("vocab_size: %d" % vocab_size)

image_name_to_tokens = parse_token_file(token_file)
image_name_to_token_ids = convert_token_to_id(image_name_to_tokens, vocab)

data = ImageCaptionData(image_name_to_token_ids,
                        input_filenamepatten,
                        hps.num_timesteps,
                        vocab)
image_feature_dim = data.image_feature_size()
tf.logging.info("image_feature_dim: %d" % image_feature_dim)

# Upper bound on the number of training iterations of this cell.
num_train_steps = 100000

with tf.Graph().as_default():
    placeholders, metrics, global_step = create_model(hps, vocab_size, image_feature_dim)
    image_feature, sentence, mask, keep_prob = placeholders
    loss, generated_words, train_op = metrics
    summary_op = tf.summary.merge_all()
    init_op = tf.global_variables_initializer()
    saver = tf.train.Saver(max_to_keep=10)

    with tf.Session() as sess:
        sess.run(init_op)
        writer = tf.summary.FileWriter(output_dir, sess.graph)
        # Close the writer even when training is interrupted from the
        # notebook, so buffered events are flushed to disk.
        try:
            tf.logging.info("[*] Reading checkpoint ...")
            ckpt = tf.train.get_checkpoint_state(output_dir)
            if ckpt and ckpt.model_checkpoint_path:
                ckpt_name = os.path.basename(ckpt.model_checkpoint_path)
                saver.restore(sess, os.path.join(output_dir, ckpt_name))
                tf.logging.info("[*] Success Read Checkpoint From %s" % (ckpt_name))
            else:
                tf.logging.info("[*] Failed load checkpoint")

            # -1 means "nothing logged / saved yet in this run".
            last_log_step = -1
            last_save_step = -1
            for i in range(num_train_steps):
                batch_image_features, batch_sentence_ids, batch_weights = data.next(hps.batch_size)
                input_vals = (batch_image_features, batch_sentence_ids, batch_weights, hps.keep_prob)
                feed_dict = dict(zip(placeholders, input_vals))

                # global_step_val still holds the value fetched in the
                # previous iteration here (it is first assigned below; the
                # `== -1` short-circuit covers iteration 0).
                should_log = last_log_step == -1 or (
                    global_step_val - last_log_step >= hps.log_frequent)
                fetches = [global_step, loss, train_op]

                # The merged summaries (gradient histograms) and the argmax
                # words are only consumed on logging steps, so they are only
                # computed on those steps.
                if should_log:
                    fetches += [summary_op, generated_words]
                outputs = sess.run(fetches, feed_dict)
                global_step_val, loss_val = outputs[0:2]

                if should_log:
                    summary_str, generated_words_val = outputs[3:]
                    # Word accuracy over the non-padding positions only.
                    equal = (generated_words_val == batch_sentence_ids)
                    weight_equal = equal * batch_weights
                    accuracy = np.sum(weight_equal) / (np.sum(batch_weights) * 1.0)
                    writer.add_summary(summary_str, global_step_val)
                    tf.logging.info(
                        'Step: %5d, last_step: %5d, loss: %3.5f, word_accuracy: %3.5f'
                        % (global_step_val, last_log_step, loss_val, accuracy))
                    last_log_step = global_step_val
                should_save = last_save_step == -1 or (
                    global_step_val - last_save_step >= hps.save_frequent)
                if should_save:
                    if last_save_step != -1:
                        tf.logging.info("Step: %d, image caption model saved"
                                        % (global_step_val))
                    saver.save(sess, os.path.join(output_dir, "lstm"),
                               global_step=global_step_val)
                    last_save_step = global_step_val
        finally:
            writer.close()

INFO:tensorflow:vocab_size: 8186
INFO:tensorflow:loading features/image_features-0.pickle
INFO:tensorflow:loading features/image_features-1.pickle
(200, 1536)
(200,)
INFO:tensorflow:image_feature_dim: 1536
INFO:tensorflow:variable name: embedding/embeddings:0
INFO:tensorflow:variable name: image_feature_embed/dense/kernel:0
INFO:tensorflow:variable name: image_feature_embed/dense/bias:0
INFO:tensorflow:variable name: fc/multi_rnn_cell/cell_0/basic_lstm_cell/kernel:0
INFO:tensorflow:variable name: fc/multi_rnn_cell/cell_0/basic_lstm_cell/bias:0
INFO:tensorflow:variable name: fc/multi_rnn_cell/cell_1/basic_lstm_cell/kernel:0
INFO:tensorflow:variable name: fc/multi_rnn_cell/cell_1/basic_lstm_cell/bias:0
INFO:tensorflow:variable name: fc/fc1/kernel:0
INFO:tensorflow:variable name: fc/fc1/bias:0
INFO:tensorflow:variable name: fc/logit/kernel:0
INFO:tensorflow:variable name: fc/logit/bias:0
INFO:tensorflow:Summary name embedding/embeddings:0_grad is illegal; using embedding/embeddings_0_grad

INFO:tensorflow:Step: 12671, last_step: 12570, loss: 1.83333, word_accuracy: 0.58000
INFO:tensorflow:Step: 12772, last_step: 12671, loss: 1.62713, word_accuracy: 0.54000
INFO:tensorflow:Step: 12873, last_step: 12772, loss: 1.83426, word_accuracy: 0.54000
INFO:tensorflow:Step: 12974, last_step: 12873, loss: 1.57090, word_accuracy: 0.58000
INFO:tensorflow:Step: 13075, last_step: 12974, loss: 1.86097, word_accuracy: 0.58000
INFO:tensorflow:Step: 13176, last_step: 13075, loss: 1.74461, word_accuracy: 0.60000
INFO:tensorflow:Step: 13277, last_step: 13176, loss: 2.22008, word_accuracy: 0.46000
INFO:tensorflow:Step: 13378, last_step: 13277, loss: 1.77503, word_accuracy: 0.54000
INFO:tensorflow:Step: 13479, last_step: 13378, loss: 1.98910, word_accuracy: 0.44000
INFO:tensorflow:Step: 13580, last_step: 13479, loss: 2.00450, word_accuracy: 0.54000
INFO:tensorflow:Step: 13681, last_step: 13580, loss: 1.70365, word_accuracy: 0.58000
INFO:tensorflow:Step: 13782, last_step: 13681, loss: 1.73004, wor

INFO:tensorflow:Step: 22165, last_step: 22064, loss: 1.30394, word_accuracy: 0.64000
INFO:tensorflow:Step: 22266, last_step: 22165, loss: 1.22579, word_accuracy: 0.66000
INFO:tensorflow:Step: 22367, last_step: 22266, loss: 1.30650, word_accuracy: 0.60000
INFO:tensorflow:Step: 22468, last_step: 22367, loss: 1.11202, word_accuracy: 0.74000
INFO:tensorflow:Step: 22569, last_step: 22468, loss: 1.35233, word_accuracy: 0.66000
INFO:tensorflow:Step: 22670, last_step: 22569, loss: 1.18168, word_accuracy: 0.72000
INFO:tensorflow:Step: 22771, last_step: 22670, loss: 1.21955, word_accuracy: 0.72000
INFO:tensorflow:Step: 22872, last_step: 22771, loss: 1.12111, word_accuracy: 0.74000
INFO:tensorflow:Step: 22973, last_step: 22872, loss: 1.11485, word_accuracy: 0.76000
INFO:tensorflow:Step: 23074, last_step: 22973, loss: 1.25644, word_accuracy: 0.70000
INFO:tensorflow:Step: 23175, last_step: 23074, loss: 1.21342, word_accuracy: 0.72000
INFO:tensorflow:Step: 23276, last_step: 23175, loss: 1.20090, wor

KeyboardInterrupt: 