In [1]:
import tensorflow as tf
from tensorflow.contrib import layers

In [2]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import logging
from IPython import embed

In [3]:
class Seq2seq:
    def __init__(self, vocab_size, residual=True):
        self.residual = residual
        self.vocab_size = vocab_size

    def make_graph(self,mode, features, labels):
        embed_dim = 256
        num_units = 256

        input,output   = features['input'], features['output']
        batch_size     = tf.shape(input)[0]
        start_tokens   = tf.zeros([batch_size], dtype= tf.int64)
        train_output   = tf.concat([tf.expand_dims(start_tokens, 1), output], 1)
        input_lengths  = tf.reduce_sum(tf.to_int32(tf.not_equal(input, 1)), 1)
        output_lengths = tf.reduce_sum(tf.to_int32(tf.not_equal(train_output, 1)), 1)
        input_embed    = layers.embed_sequence(input, vocab_size=self.vocab_size, embed_dim = embed_dim, scope = 'embed')
        output_embed   = layers.embed_sequence(train_output, vocab_size=self.vocab_size, embed_dim = embed_dim, scope = 'embed', reuse = True)
        with tf.variable_scope('embed', reuse=True):
            embeddings = tf.get_variable('embeddings')
        cell = tf.contrib.rnn.LSTMCell(num_units=num_units)
        if self.residual:
            cell = tf.contrib.rnn.ResidualWrapper(cell)
        encoder_outputs, encoder_final_state = tf.nn.dynamic_rnn(cell, input_embed, dtype=tf.float32)


        def decode(helper, scope, reuse=None):
            # Decoder is partially based on @ilblackdragon//tf_example/seq2seq.py
            with tf.variable_scope(scope, reuse=reuse):
                attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(
                    num_units=num_units, memory=encoder_outputs,
                    memory_sequence_length=input_lengths)
                cell = tf.contrib.rnn.LSTMCell(num_units=num_units)
                attn_cell = tf.contrib.seq2seq.AttentionWrapper(cell, attention_mechanism, attention_layer_size=num_units / 2)
                out_cell = tf.contrib.rnn.OutputProjectionWrapper(attn_cell, self.vocab_size, reuse=reuse)
                decoder = tf.contrib.seq2seq.BasicDecoder(
                    cell=out_cell, helper=helper,
                    initial_state=out_cell.zero_state(
                        dtype=tf.float32, batch_size=batch_size))
                outputs = tf.contrib.seq2seq.dynamic_decode(
                    decoder=decoder, output_time_major=False,
                    impute_finished=True, maximum_iterations=30)
                return outputs[0]

        train_helper = tf.contrib.seq2seq.TrainingHelper(output_embed, output_lengths)
        pred_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(embeddings, start_tokens=tf.to_int32(start_tokens), end_token=1)
        train_outputs = decode(train_helper, 'decode')
        pred_outputs  = decode(pred_helper, 'decode', reuse=True)

        tf.identity(train_outputs.sample_id[0], name='train_pred')
        weights = tf.to_float(tf.not_equal(train_output[:, :-1], 1))
        loss = tf.contrib.seq2seq.sequence_loss(train_outputs.rnn_output, output, weights=weights)
        train_op = layers.optimize_loss(
            loss, tf.train.get_global_step(),
            optimizer='Adam',
            learning_rate=0.001,
            summaries=['loss', 'learning_rate'])

        tf.identity(pred_outputs.sample_id[0], name='predict')
        return tf.estimator.EstimatorSpec(mode=mode, predictions=pred_outputs.sample_id, loss=loss, train_op=train_op)

In [4]:
class Data:
    def __init__(self, input_filename, output_filename, vocab_filename):
        self.input_filename = input_filename
        self.output_filename = output_filename
        self.vocab_filename = vocab_filename
        
        # create vocab and reverse vocab maps
        self.vocab     = {}
        self.rev_vocab = {}
        self.END_TOKEN = 1 
        self.UNK_TOKEN = 2
        self.FLIP = False
        with open(vocab_filename) as f:
            for idx, line in enumerate(f):
                self.vocab[line.strip()] = idx
                self.rev_vocab[idx] = line.strip()
        self.vocab_size = len(self.vocab)

    def tokenize_and_map(self,line):
        return [self.vocab.get(token, self.UNK_TOKEN) for token in line.split(' ')]

    def prepare(self,text):
        tokens = self.tokenize_and_map(text)
        input_length   = len(tokens)
        source = [tokens]
        source[0] += [self.END_TOKEN] * (input_length - len(source[0]))
        return source



    def single(self, sentence):
        tokens = self.tokenize_and_map(sentence)
        def input_fn():
            inp = tf.placeholder(tf.int64, shape=[None, None], name='input')
            output = tf.placeholder(tf.int64, shape=[None, None], name='output')
            tf.identity(inp[0], 'source')
            tf.identity(output[0], 'target')
            return { 'input': inp, 'output': output}, None
        def feed_fn():
            input_length   = len(tokens)
            source = [tokens]
            source[0] += [self.END_TOKEN] * (input_length - len(source[0]))
            # this source is not used to compute anything, just so that placeholder does not complain about
            # missing values for target during prediction
            self.FLIP = not self.FLIP
            if not self.FLIP:
                raise StopIteration

            return { 'input:0': source, 'output:0': source }
        return input_fn, feed_fn

    def make_input_fn(self):
        def input_fn():
            inp = tf.placeholder(tf.int64, shape=[None, None], name='input')
            output = tf.placeholder(tf.int64, shape=[None, None], name='output')
            tf.identity(inp[0], 'source')
            tf.identity(output[0], 'target')
            return { 'input': inp, 'output': output}, None

        def sampler():
            while True:
                with open(self.input_filename) as finput, open(self.output_filename) as foutput:
                    for source,target in zip(finput, foutput):
                        yield {
                            'input': self.tokenize_and_map(source)[:30 - 1] + [self.END_TOKEN],
                            'output': self.tokenize_and_map(target)[:30 - 1] + [self.END_TOKEN]}

        data_feed = sampler()
        def feed_fn():
            source, target = [], []
            input_length, output_length = 0, 0
            for i in range(32):
                rec = data_feed.__next__()
                source.append(rec['input'])
                target.append(rec['output'])
                input_length = max(input_length, len(source[-1]))
                output_length = max(output_length, len(target[-1]))
            for i in range(32):
                source[i] += [self.END_TOKEN] * (input_length - len(source[i]))
                target[i] += [self.END_TOKEN] * (output_length - len(target[i]))
            return { 'input:0': source, 'output:0': target }
        return input_fn, feed_fn

    def get_formatter(self,keys):
        def to_str(sequence):
            tokens = [
                self.rev_vocab.get(x, "<UNK>") for x in sequence]
            return ' '.join(tokens)

        def format(values):
            res = []
            for key in keys:
                res.append("****%s == %s" % (key, to_str(values[key]).replace('</S>','').replace('<S>', '')))
            return '\n'+'\n'.join(res)
        return format

In [5]:
class Predict:
    def __init__(self, checkpoint='checkpoint', directory='coco'):
        self.data  = Data(directory + '/train_source.txt', 
                          directory + '/train_target.txt', 
                          directory + '/train_vocab.txt')
        model = Seq2seq(self.data.vocab_size)
        estimator = tf.estimator.Estimator(model_fn=model.make_graph, model_dir=checkpoint)
        def input_fn():
            inp = tf.placeholder(tf.int64, shape=[None, None], name='input')
            output = tf.placeholder(tf.int64, shape=[None, None], name='output')
            tf.identity(inp[0], 'source')
            tf.identity(output[0], 'target')
            dict =  { 'input': inp, 'output': output}
            return tf.estimator.export.ServingInputReceiver(dict, dict)
        self.predictor = tf.contrib.predictor.from_estimator(estimator, input_fn)

    def infer(self, sentence):
        input = self.data.prepare(sentence)
        predictor_prediction = self.predictor({"input": input, "output":input})
        words = [self.data.rev_vocab.get(i, '<UNK>') for i in predictor_prediction['output'][0] if i > 2]
        return ' '.join(words)


In [14]:
data  = Data('coco/train_source.txt', 'coco/train_target.txt', 'coco/train_vocab.txt')
model = Seq2seq(data.vocab_size)

input_fn, feed_fn = data.make_input_fn()
print_inputs = tf.train.LoggingTensorHook( ['source', 'target', 'predict'], every_n_iter=100, 
        formatter=data.get_formatter(['source', 'target', 'predict']))

estimator = tf.estimator.Estimator(model_fn=model.make_graph, model_dir='checkpoint_coco')#, params=FLAGS)
estimator.train(input_fn=input_fn, hooks=[tf.train.FeedFnHook(feed_fn), print_inputs], steps=20000)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'checkpoint_coco', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7ff7d85f7510>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done ca

INFO:tensorflow:loss = 2.3881295, step = 12000 (2.737 sec)
INFO:tensorflow:global_step/sec: 36.3664
INFO:tensorflow:
****source == park benches with water flooded and grass with <UNK>           
****target == three green metal benches showing above a flooded <UNK>           
****predict == a bench that is sitting on a <UNK>    
INFO:tensorflow:loss = 2.3982341, step = 12100 (2.750 sec)
INFO:tensorflow:global_step/sec: 34.8428
INFO:tensorflow:
****source == the motorcycle officer talks to a man near a school <UNK>     
****target == police officers on a road by a yellow <UNK>       
****predict == a man riding a bike down a <UNK>     
INFO:tensorflow:loss = 2.3398442, step = 12200 (2.871 sec)
INFO:tensorflow:global_step/sec: 35.6483
INFO:tensorflow:
****source == a yellow and blue bus driving down a city <UNK>     
****target == the city bus is painted yellow and blue with a flower on <UNK>  
****predict == a bus is driving down the street in the <UNK>   
INFO:tensorflow:loss = 2.466621

INFO:tensorflow:
****source == there are a lot of elephants walking on the <UNK>          
****target == a number of elephants walking in a field of green <UNK>         
****predict == a group of elephants walking across a dirt <UNK>    
INFO:tensorflow:loss = 2.0636544, step = 14600 (2.888 sec)
INFO:tensorflow:global_step/sec: 34.4691
INFO:tensorflow:
****source == a garden has flowers and teddy bear <UNK>         
****target == it looks like these bears are no longer wanted by <UNK>      
****predict == a bunch of flowers and a stuffed <UNK>     
INFO:tensorflow:loss = 2.5320826, step = 14700 (2.901 sec)
INFO:tensorflow:global_step/sec: 35.4339
INFO:tensorflow:
****source == a brown dog with its tongue out with an <UNK>        
****target == a dog sitting with his mouth open and a dog collar <UNK>      
****predict == a dog with a dog in its mouth with a <UNK>   
INFO:tensorflow:loss = 2.1660576, step = 14800 (2.822 sec)
INFO:tensorflow:global_step/sec: 33.6507
INFO:tensorflow:
****s

INFO:tensorflow:global_step/sec: 34.2203
INFO:tensorflow:
****source == a man driving a car and eating a <UNK>      
****target == a man attempts to eat a pastry while <UNK>          
****predict == a man is eating a doughnut with a <UNK>    
INFO:tensorflow:loss = 2.075069, step = 17100 (2.921 sec)
INFO:tensorflow:global_step/sec: 33.9759
INFO:tensorflow:
****source == some people are standing near a fast <UNK>                  
****target == a number of surfers near a river with a crowd <UNK>               
****predict == a man standing next to a man holding a <UNK>   
INFO:tensorflow:loss = 2.4221299, step = 17200 (2.943 sec)
INFO:tensorflow:global_step/sec: 34.1858
INFO:tensorflow:
****source == the workers had a bunch of donuts on the <UNK>     
****target == a man and a woman working in a <UNK>      
****predict == a man is holding a bunch of doughnuts on a <UNK> 
INFO:tensorflow:loss = 1.8384401, step = 17300 (2.925 sec)
INFO:tensorflow:global_step/sec: 34.4775
INFO:tensorflow:


INFO:tensorflow:loss = 2.2248015, step = 19500 (2.994 sec)
INFO:tensorflow:global_step/sec: 32.9907
INFO:tensorflow:
****source == a multitude of stuffed plush teddy bear <UNK>                      
****target == a pile of teddy bears with price <UNK>                      
****predict == a pile of stuffed animals sitting on top of a <UNK> 
INFO:tensorflow:loss = 2.3979166, step = 19600 (3.031 sec)
INFO:tensorflow:global_step/sec: 33.4477
INFO:tensorflow:
****source == a large <UNK> rectangular clock is surrounded by a forest of <UNK>    
****target == a clock tower out in the middle of nowhere <UNK>      
****predict == a clock is sitting on a brick <UNK>       
INFO:tensorflow:loss = 1.8435799, step = 19700 (2.990 sec)
INFO:tensorflow:global_step/sec: 34.2696
INFO:tensorflow:
****source == a man wearing a helmet eats a slice of <UNK>     
****target == the man in the red helmet is biting a slice of <UNK>   
****predict == a man that is sitting down with a <UNK>   
INFO:tensorflow:loss

INFO:tensorflow:global_step/sec: 34.0758
INFO:tensorflow:
****source == a man sitting on a park bench holding <UNK>             
****target == a man sitting on a bench holding a piece of the <UNK>          
****predict == a man sitting on a bench holding a <UNK>    
INFO:tensorflow:loss = 2.6271336, step = 22100 (2.936 sec)
INFO:tensorflow:global_step/sec: 34.457
INFO:tensorflow:
****source == a stoplight hanging in front of a <UNK>               
****target == a close up of a stop light positioned against a high rise <UNK>          
****predict == a traffic light hanging from a street <UNK>    
INFO:tensorflow:loss = 2.094672, step = 22200 (2.901 sec)
INFO:tensorflow:global_step/sec: 35.9824
INFO:tensorflow:
****source == the people are sitting on a bench that looks like a <UNK>     
****target == there's not much danger that anyone will steal this <UNK>       
****predict == a group of people sitting on a bench next to a <UNK> 
INFO:tensorflow:loss = 2.1575305, step = 22300 (2.779 se

INFO:tensorflow:loss = 2.500101, step = 24500 (2.869 sec)
INFO:tensorflow:global_step/sec: 33.574
INFO:tensorflow:
****source == a man riding on top of an elephant on a dirt <UNK>         
****target == an elephant with a seating area on top standing by a loading <UNK>        
****predict == a man riding an elephant on a <UNK>     
INFO:tensorflow:loss = 1.9907562, step = 24600 (2.978 sec)
INFO:tensorflow:global_step/sec: 34.3361
INFO:tensorflow:
****source == a woman with an umbrella walking past a building with <UNK>          
****target == a woman with an umbrella walking by an artist's <UNK>           
****predict == a woman walking down a street with an <UNK>        
INFO:tensorflow:loss = 2.4374924, step = 24700 (2.912 sec)
INFO:tensorflow:global_step/sec: 35.2497
INFO:tensorflow:
****source == a person in white shirt standing by building with <UNK>          
****target == people carrying umbrellas make orders at a walk up <UNK>          
****predict == a man standing in front of

INFO:tensorflow:loss = 1.9089303, step = 27000 (2.981 sec)
INFO:tensorflow:global_step/sec: 34.293
INFO:tensorflow:
****source == a man riding a skateboard while covered in <UNK>     
****target == a man crouching on a skateboard is <UNK>      
****predict == a man is riding a skateboard in the <UNK>     
INFO:tensorflow:loss = 2.204521, step = 27100 (2.916 sec)
INFO:tensorflow:global_step/sec: 34.5775
INFO:tensorflow:
****source == a person is holding a donut in front of a woman 's <UNK>       
****target == happy woman showing off red frosted doughnut in serving <UNK>          
****predict == a woman holding a donut in front of a woman in a <UNK> 
INFO:tensorflow:loss = 2.1928823, step = 27200 (2.892 sec)
INFO:tensorflow:global_step/sec: 34.1345
INFO:tensorflow:
****source == a man in black jacket standing on a snowboard next to <UNK>           
****target == there is a man standing on a snow board in the snow <UNK>          
****predict == a man on a snowboard stands on a snowy <UNK

INFO:tensorflow:loss = 2.295193, step = 29500 (2.987 sec)
INFO:tensorflow:global_step/sec: 33.3324
INFO:tensorflow:
****source == a lady on a phone sitting on a <UNK>        
****target == the lovely young lady sits on the couch and holds a cell phone to her <UNK> 
****predict == a woman sitting on a bench talking on her <UNK>    
INFO:tensorflow:loss = 2.0387418, step = 29600 (3.000 sec)
INFO:tensorflow:global_step/sec: 32.6774
INFO:tensorflow:
****source == a woman strolls along a busy street while on her <UNK>               
****target == a lady walking and talking on a cell phone of some <UNK>              
****predict == a woman walking down a street next to a <UNK>    
INFO:tensorflow:loss = 2.017858, step = 29700 (3.060 sec)
INFO:tensorflow:global_step/sec: 34.0393
INFO:tensorflow:
****source == a close shot of a vegetable pizza ready to be <UNK>          
****target == the pizza on the table has many toppings of meat and <UNK>         
****predict == a pizza with a lot of toppi

<tensorflow_estimator.python.estimator.estimator.Estimator at 0x7ff7d85f7b10>

In [15]:
P = Predict(checkpoint='checkpoint_coco', directory='coco')

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'checkpoint_coco', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7ff734fd1210>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done ca

In [23]:
P.infer('kitchen stove  sink  and counter with stuff on it')

'a kitchen with a stove and microwave oven on top of'

In [26]:
P.infer('graffiti ed stop sign across the street from a red car')

'a red stop sign with a red stop sign on'