In [2]:
from Utils.FS import file
from Utils.tensorflow_helper import show_graph
from nltk.corpus import brown
from sklearn.preprocessing import normalize
import numpy as np
import tensorflow as tf
import math
import TextPreprocess.words2dict as words2dict
from tensorflow.python.layers import core as layers_core
from tensorflow.python.client import timeline

In [3]:
# Seed NumPy's global RNG so NumPy-based randomness is repeatable across runs.
# NOTE(review): no TensorFlow seed is set in the visible cells — confirm whether
# the tf.random_* initializers used when building the graph also need seeding.
np.random.seed(1234)

In [4]:
# Brown corpus as a sequence of tokenized sentences (each sentence is a list of word strings).
sents = brown.sents()
# Optional case-folding of sentences, currently disabled:
#sents = [[token.lower() for token in sent] for sent in sents]
# Brown corpus as one flat sequence of word tokens.
words = brown.words()
# Optional case-folding of words, currently disabled:
#words = [word.lower() for word in words]

In [5]:
# Input: a list of iterable sequences.
# Assumption 1: the number of unique tokens is not large; apply stemming / lowercasing if needed.

In [6]:
# Peek at the first record of each corpus view: iterating a word yields its
# characters, iterating a sentence yields its word tokens.
print(list(words[0]))
print(list(sents[0]))

['T', 'h', 'e']
['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.']


In [70]:
from functools import lru_cache
from collections import Counter

# `Sequences` wraps an iterable of sequences, where each sequence is an iterable of symbols.
# The class builds lookup tables that map each symbol to an index and vice versa.
# Indices are assigned in descending order of symbol frequency.
# This ordering is required by, for example, TensorFlow's nce_loss, whose default
# log-uniform candidate sampler assumes that frequent classes have low ids.

class Sequences:
    """Symbol <-> index lookup tables plus a padded mini-batch generator.

    ``sequences`` is a re-iterable collection of sequences; each sequence is an
    iterable of hashable, mutually orderable symbols (a word is a sequence of
    characters, a sentence is a sequence of words, ...).

    Indices are assigned by descending symbol frequency (ties broken by
    descending symbol order). The three reserved symbols ``<UNK>``, ``<PAD>``
    and ``<GO>`` always take the last three indices, in that order.

    Attributes:
        seqs: the wrapped sequences.
        sym2Idx: dict mapping symbol -> index.
        idx2Sym: dict mapping index -> symbol.
        dict_size: number of distinct indices, reserved symbols included.
        UNK, PAD, GO: the reserved symbol strings.
    """

    def __init__(self, sequences=None, verbose=False):
        # Use a fresh list per instance: a mutable default argument would be
        # shared by every instance constructed without ``sequences``.
        self.seqs = [] if sequences is None else sequences
        self.verbose = verbose
        self._max_length = None  # filled lazily by max_length()
        self.build_dicts()
        if self.verbose:
            self.describe()

    def max_length(self):
        """Return the length of the longest sequence (0 when there are none).

        The value is computed once and cached on the instance.
        """
        if getattr(self, '_max_length', None) is None:
            self._max_length = max((len(seq) for seq in self.seqs), default=0)
        return self._max_length

    def build_dicts(self):
        """Build the forward (``sym2Idx``) and backward (``idx2Sym``) lookup tables."""
        if self.verbose:
            print("Building dictionaries...")

        self.UNK = '<UNK>'
        self.PAD = '<PAD>'
        self.GO = '<GO>'
        specials = (self.UNK, self.PAD, self.GO)

        counter = Counter(symbol for seq in self.seqs for symbol in seq)
        # A corpus token equal to a reserved symbol must not claim a frequency
        # rank of its own: its index would be overwritten below, leaving a gap
        # in the index space and a dict_size smaller than max index + 1.
        for special in specials:
            counter.pop(special, None)

        ranked = sorted(counter.items(), key=lambda item: (item[1], item[0]), reverse=True)
        self.sym2Idx = {symbol: index for index, (symbol, _) in enumerate(ranked)}
        for special in specials:
            self.sym2Idx[special] = len(self.sym2Idx)

        self.idx2Sym = {index: symbol for symbol, index in self.sym2Idx.items()}
        self.dict_size = len(self.idx2Sym)

    def batchPadding(self, batch):
        """Right-pad a list of index lists with the PAD index.

        Returns an array of shape (len(batch), longest record); an empty batch
        gives an empty (0, 0) array.
        """
        width = max((len(record) for record in batch), default=0)
        padded = np.full((len(batch), width), self.sym2Idx[self.PAD])
        for row, record in zip(padded, batch):
            row[:len(record)] = record
        return padded

    def batchMask(self, batch):
        """Return a float mask shaped like ``batchPadding(batch)``: 1.0 on real symbols, 0.0 on padding."""
        width = max((len(record) for record in batch), default=0)
        mask = np.full((len(batch), width), 0.0)
        for row, record in zip(mask, batch):
            row[:len(record)] = 1.0
        return mask

    def generator(self, batch_size, epouch):
        """Yield ``(padded_indices, lengths, mask)`` mini-batches.

        Args:
            batch_size: number of sequences per batch; must be >= 1.
            epouch: number of passes over ``seqs``; a negative value loops forever.

        Batches are filled across epoch boundaries, so every batch is full
        except possibly the very last one, which carries whatever is left after
        the final epoch (previously that remainder was silently dropped).

        Raises:
            ValueError: if ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer, got {}".format(batch_size))

        train = []
        length = []
        while epouch != 0:
            saw_sequence = False
            for seq in self.seqs:
                saw_sequence = True
                train.append([self.sym2Idx[sym] for sym in seq])
                length.append(len(seq))
                if len(train) == batch_size:
                    yield self.batchPadding(train), length, self.batchMask(train)
                    train = []
                    length = []
            epouch -= 1
            print('epouch done...')
            if epouch < 0 and not saw_sequence:
                # Endless mode over an empty corpus would otherwise spin forever
                # without ever yielding.
                return

        if train:
            # Flush the sequences that did not fill a whole batch.
            yield self.batchPadding(train), length, self.batchMask(train)

    def getGenerator(self, batch_size=32, epouch=-1):
        """Convenience wrapper around ``generator`` with default arguments."""
        return self.generator(batch_size, epouch)

    def describe(self):
        """Print summary statistics of the corpus and a sample batch."""
        print("Number of sequences: {}".format(len(self.seqs)))
        print("Longest sequence length: {}".format(self.max_length()))
        total_sym = len(self.idx2Sym)
        shown = min(total_sym, 10)
        print("Distinct symbols: {}".format(total_sym))
        print("Top {0} most frequent symbols: {1}".format(shown, [self.idx2Sym[i] for i in range(shown)]))
        print("Top {0} least frequent symbols: {1}".format(shown, [self.idx2Sym[total_sym - i - 1] for i in range(shown)]))
        print("Special Symbols: {}, {}, {}".format(self.GO, self.PAD, self.UNK))
        # next(..., None) keeps describe() usable on an empty corpus.
        print("First batch:\n{}".format(next(self.getGenerator(3, 1), None)))

In [71]:
class RNNEmbedding:
    """Ties a `Sequences` vocabulary to an (optional) encoder/decoder pair.

    Args:
        sequences: a `Sequences` instance providing the symbol tables.
        encoder: encoder object, stored as-is; may be omitted.
        decoder: decoder object, stored as-is; may be omitted.
    """
    def __init__(self, sequences, encoder=None, decoder=None):
        assert isinstance(sequences, Sequences), "sequences must be a Sequences instance"
        self.SEQS = sequences
        # Previously these two arguments were required but discarded.
        self.encoder = encoder
        self.decoder = decoder

In [72]:
# Character-level vocabulary: every Brown word is one sequence and its characters are the symbols.
seqs = Sequences(words, verbose=True)

Building dictionaries...
Number of sequences: 1161192
Longest sequence length: 33
Distinct symbols: 86
Top 10 most frequent symbols: ['e', 't', 'a', 'o', 'i', 'n', 's', 'r', 'h', 'l']
Top 10 least frequent symbols: ['<GO>', '<PAD>', '<UNK>', '+', '[', ']', '{', '}', 'X', 'Z']
Special Symbols: <GO>, <PAD>, <UNK>
First batch:
(array([[26,  8,  0, 84, 84, 84],
       [45, 12,  9,  1,  3,  5],
       [33,  3, 12,  5,  1, 18]]), [3, 6, 6], array([[ 1.,  1.,  1.,  0.,  0.,  0.],
       [ 1.,  1.,  1.,  1.,  1.,  1.],
       [ 1.,  1.,  1.,  1.,  1.,  1.]]))


In [69]:
# Word-level vocabulary: every Brown sentence is one sequence and its word tokens are the symbols.
# NOTE(review): the saved output of this cell carries execution count 69, i.e. it was
# produced before the current `Sequences` definition (In [70]); re-run to refresh it.
seqs2 = Sequences(sents, verbose=True)

Building dictionaries...
Number of sequences: 57340
Longest sequence length: 180
Distinct symbols: 56060
Top 10 most frequent symbols: ['the', ',', '.', 'of', 'and', 'to', 'a', 'in', 'that', 'is']
Top 10 least frequent symbols: ['<GO>', '<PAD>', '<UNK>', '$.054/mbf', '$.07/cwt', '$.076', '$.09', '$.10-a-minute', '$.105', '$.12']
Special Symbols: <GO>, <PAD>, <UNK>
First batch:
(array([['14', '5859', '1295', '5596', '17467', '59', '1885', '34', '2608',
        '3', '17773', '595', '1168', '1531', '1211', '12', '67', '477',
        '13', '8', '84', '10003', '220', '188', '2', '<PAD>', '<PAD>',
        '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>',
        '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>',
        '<PAD>', '<PAD>'],
       ['14', '1764', '505', '59', '7', '32344', '35546', '8', '0', '769',
        '9578', '1246', '1', '35', '25', '3190', '873', '3', '0', '1531',
        '1', '12', '6080', '0', '5679', '4', '3923', '3', '0', '769', '3',
        

In [16]:
# RNNEmbedding.__init__ takes (sequences, encoder, decoder). No encoder/decoder
# objects exist yet at this point, so pass explicit placeholders; omitting the
# arguments raises TypeError with the three-argument signature.
embedding = RNNEmbedding(seqs, encoder=None, decoder=None)

In [None]:
class RNNEncoder:
    """Stores the `sizes` configuration of an encoder.

    Args:
        sizes: list of sizes (presumably units per layer — TODO confirm);
            defaults to ``[10, 10]``.
    """
    def __init__(self, sizes=None):
        # Build the default per instance: a mutable default argument would be
        # shared (and mutated) across every instance that relies on it.
        self.sizes = [10, 10] if sizes is None else sizes
        
        

In [None]:
# Disabled settings kept for reference:
#TRAIN_LENGTH = 4
#WINDOW_SIZE = 2
# NOTE(review): BATCH_SIZE is not referenced by any of the cells visible here — confirm it is still needed.
BATCH_SIZE = 64

In [None]:
# Unit counts of the stacked GRU encoder cells, one entry per layer.
ENCODER_DIMENSION = [50, 150, 150, 150]
ENCODER_LAYERS = len(ENCODER_DIMENSION)

# Unit counts of the stacked GRU decoder cells, one entry per layer.
RNN_DIMENSION = [150, 150, 150, 50]
RNN_LAYERS = len(RNN_DIMENSION)

# Width of each row of the embedding table.
DIMENSION = 50
# num_sampled for the NCE loss helper defined in the graph cell.
NEGATIVE_SAMPLE = 256
# num_sampled for the sampled-softmax loss used by the graph cell.
# NOTE(review): VOCABULAY_SIZE is not defined in any visible cell — confirm
# where it comes from before running on a fresh kernel.
SOFTMAX_SAMPLE = int(np.log2(VOCABULAY_SIZE)) + 64
# 'train' or 'infer'; selects the decoder helper when the graph is built.
MODE = 'train'

print(SOFTMAX_SAMPLE)

In [None]:
# Sequence autoencoder graph: a stacked-GRU encoder summarises each input
# sequence, a stacked-GRU decoder reconstructs it, and the reconstruction is
# scored with a sampled-softmax sequence loss.
# NOTE(review): VOCABULAY_SIZE, GO_SYMBOL and END_SYMBOL are not defined in the
# visible cells — confirm they exist before running on a fresh kernel.
graph = tf.Graph()

with graph.as_default():

    # ------------------------------------------------------------ inputs
    # Padded batch of symbol indices, fed as (batch, time) int32.
    inputs = tf.placeholder(tf.int32, (None, None), name="Input_Sentence_Word_Index")

    batch_size = tf.shape(inputs)[0]
    steps = tf.shape(inputs)[1]

    # Unpadded length of every row of `inputs`, fed as (batch,) int32.
    # `(None)` is just `None` (a fully unknown shape); the one-element tuple
    # declares the intended rank-1 vector.
    input_lengths = tf.placeholder(tf.int32, (None,), name="Input_Sentence_Length")

    encoder_inputs = inputs

    # Decoder input = encoder input shifted right by one step, with GO_SYMBOL
    # in the first column. tf.pad fills with zeros, so GO_SYMBOL is subtracted
    # before padding and added back afterwards.
    decoder_inputs = tf.pad(
        tf.slice(encoder_inputs, [0, 0], [batch_size, steps - 1]) - GO_SYMBOL,
        [[0, 0], [1, 0]]
    ) + GO_SYMBOL

    decoder_input_lengths = tf.reshape(input_lengths, [batch_size])

    # Loss weights, fed as (batch, time) float32: 1 on real tokens, 0 on padding.
    decoder_masks = tf.placeholder(tf.float32, (None, None), name="Input_Sentence_Mask")

    # -------------------------------------------------------- embeddings
    # Frozen embedding table (trainable=False); saved/restored via `word2VecSaver`.
    embeddings = tf.Variable(
        tf.random_uniform([VOCABULAY_SIZE, DIMENSION], -1.0, 1.0),
        trainable=False, name="Word2Vec")

    # (batch, time) int32 -> (batch, time, DIMENSION) float32
    inputs_embed = tf.nn.embedding_lookup(embeddings, inputs, max_norm=1)
    decoder_embed = tf.nn.embedding_lookup(embeddings, decoder_inputs, max_norm=1)

    # ----------------------------------------------------------- encoder
    with tf.variable_scope("Encoder") as encoder_scope:
        cells = [tf.contrib.rnn.GRUCell(ENCODER_DIMENSION[i]) for i in range(ENCODER_LAYERS)]
        stack = tf.contrib.rnn.MultiRNNCell(cells)

        rnn_outputs, rnn_states = tf.nn.dynamic_rnn(
            stack, inputs_embed, dtype=tf.float32, sequence_length=input_lengths)

        # For every row pick the top-layer output at its last real time step:
        # flatten (batch, time, dim) to (batch * time, dim) and gather row
        # b * time + (length_b - 1).
        index = tf.range(0, batch_size) * steps + (input_lengths - 1)
        rnn_final_state = tf.gather(tf.reshape(rnn_outputs, [-1, ENCODER_DIMENSION[-1]]), index)
        rnn_final_state = tf.reshape(rnn_final_state, [-1, ENCODER_DIMENSION[-1]])
        print(rnn_outputs)
        print(rnn_final_state)

    # ----------------------------------------------------------- decoder
    with tf.variable_scope("Decoder") as decoder_scope:
        cells = [tf.contrib.rnn.GRUCell(RNN_DIMENSION[i]) for i in range(RNN_LAYERS)]
        stack = tf.contrib.rnn.MultiRNNCell(cells)

        if MODE == "train":
            # Teacher forcing: feed the shifted ground-truth embeddings.
            helper = tf.contrib.seq2seq.TrainingHelper(
                inputs=decoder_embed,
                sequence_length=decoder_input_lengths)

        elif MODE == "infer":
            # Greedy decoding looks tokens up in the embedding *table*; the
            # original passed `embedding`, which is not that tensor.
            helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
                embedding=embeddings,
                start_tokens=tf.tile([GO_SYMBOL], [batch_size]),
                end_token=END_SYMBOL)

        else:
            raise ValueError("MODE must be 'train' or 'infer', got {!r}".format(MODE))

        # Only the first decoder layer is seeded with the encoder summary; the
        # other layers start from zeros. This needs
        # ENCODER_DIMENSION[-1] == RNN_DIMENSION[0].
        decoder_init_state = [cell.zero_state(batch_size, tf.float32) for cell in cells]
        decoder_init_state[0] = rnn_final_state
        decoder_init_state = tuple(decoder_init_state)

        # No output projection layer: the raw cell outputs are handed to the
        # sampled loss below, which applies the output weights itself.
        decoder = tf.contrib.seq2seq.BasicDecoder(
            cell=stack,
            helper=helper,
            initial_state=decoder_init_state,
            output_layer=None)

        decoder_outputs, _ = tf.contrib.seq2seq.dynamic_decode(
            decoder=decoder,
            output_time_major=False,
            impute_finished=False,
            maximum_iterations=None,
            parallel_iterations=32,
            swap_memory=False,
            scope=None
        )

        # Frozen output weights/biases shared by both sampled losses and by
        # the `project` op; saved/restored via `word2VecSaver`.
        nce_weights = tf.Variable(
            tf.truncated_normal([VOCABULAY_SIZE, RNN_DIMENSION[-1]],
                                stddev=1.0 / math.sqrt(RNN_DIMENSION[-1])), trainable=False)

        nce_biases = tf.Variable(tf.zeros([VOCABULAY_SIZE]), trainable=False)

        def sample_softmax_loss(labels, inputs):
            """Sampled-softmax loss over flattened labels and decoder outputs."""
            return tf.nn.sampled_softmax_loss(
                weights=nce_weights,
                biases=nce_biases,
                labels=tf.reshape(labels, [-1, 1]),
                inputs=inputs,
                num_sampled=SOFTMAX_SAMPLE,
                num_classes=VOCABULAY_SIZE,
                num_true=1,
                sampled_values=None,
                remove_accidental_hits=True,
            )

        def nce_loss(labels, inputs):
            """NCE alternative to `sample_softmax_loss` (currently not wired in)."""
            return tf.nn.nce_loss(
                weights=nce_weights,
                biases=nce_biases,
                labels=tf.reshape(labels, [-1, 1]),
                inputs=inputs,
                num_sampled=NEGATIVE_SAMPLE,
                num_classes=VOCABULAY_SIZE)

        # Per-token loss, left unreduced over both batch and time.
        seq_loss = tf.contrib.seq2seq.sequence_loss(
            logits=decoder_outputs.rnn_output,
            targets=inputs,
            weights=decoder_masks,
            average_across_timesteps=False,  # make sure you handled decoder_masks and optimizer and average loss
            average_across_batch=False,
            softmax_loss_function=sample_softmax_loss,
            name=None
        )

        max_loss = tf.reduce_max(seq_loss)
        loss = tf.reduce_mean(seq_loss)

        # Projects decoder outputs onto the vocabulary for inspection. The
        # width must match the decoder output (the columns of nce_weights),
        # i.e. RNN_DIMENSION[-1] rather than the embedding width.
        project_input = tf.placeholder(tf.float32, (None, RNN_DIMENSION[-1]), name="Project_Inputs")
        project = tf.nn.softmax(project_input @ tf.transpose(nce_weights) + nce_biases)

    # NOTE(review): the optimizer is given the unreduced `seq_loss`, not the
    # scalar `loss` that the training cell reports — confirm this is intended.
    optimizer = tf.train.AdamOptimizer().minimize(seq_loss)

    init = tf.global_variables_initializer()
    saver = tf.train.Saver()
    word2VecSaver = tf.train.Saver({'Words2Vec': embeddings, 'NCE_Weights': nce_weights, 'NCE_Biases': nce_biases})

In [None]:
#show_graph(graph.as_graph_def())

In [None]:
# Number of iterations of the training loop.
num_steps = 2000000
# Checkpoint path for the full autoencoder graph (used with `saver`).
MODEL = './model/seq2seq-autoencoder.ckpt'
# Checkpoint path for the frozen embedding / output-weight variables (used with
# `word2VecSaver`); the file name is keyed by the embedding width DIMENSION.
WORDS2VEC_MODEL = './model/brown-Words2Vec-{}.ckpt'.format(DIMENSION)

In [None]:
def cloestWord(vec, words_vec, count=10, method='cos'):
    """Return the keys of `words_vec` whose vectors are closest to `vec`.

    Args:
        vec: query vector (array-like).
        words_vec: mapping key -> vector with the same length as `vec`.
        count: maximum number of keys to return (previously ignored and
            hard-coded to 10).
        method: 'cos' ranks by dot product, highest first — this equals cosine
            similarity only when the vectors are unit-normalised. Any other
            value ranks by squared Euclidean distance, lowest first.

    Returns:
        List of at most `count` keys, best match first.
    """
    keys = list(words_vec.keys())
    query = np.asarray(vec)
    if method == 'cos':
        dist = np.array([np.sum(query * np.asarray(words_vec[key])) for key in keys])
        top = dist.argsort()[::-1][:count]
    else:
        dist = np.array([np.sum(np.square(query - np.asarray(words_vec[key]))) for key in keys])
        top = dist.argsort()[:count]
    return [keys[i] for i in top]

def cloestWord2(word, emb, count=10, method='cos'):
    """Rank all rows of `emb` against the row belonging to `word`.

    The row index is looked up in the global `words_dict`; ranking is
    delegated to `cloestWord3`.
    """
    word_index = words_dict[word]
    query_vec = emb[word_index]
    return cloestWord3(query_vec, emb, count, method)

def cloestWord3(vec, emb, count=10, method='cos'):
    """Rank the rows of `emb` against `vec` and return the best `count`.

    Args:
        vec: query vector with as many entries as `emb` has columns.
        emb: 2-D array, one vector per row; row i belongs to `inv_words_dict[i]`.
        count: maximum number of results.
        method: 'cos' ranks by dot product, highest first (cosine similarity
            only for unit-normalised rows). Any other value ranks by squared
            Euclidean distance, lowest first, matching `cloestWord`; previously
            such a value failed because `top` and `dist` were never assigned.

    Returns:
        List of ``(word, score)`` tuples, best first, where `score` is the
        dot product or squared distance formatted with two decimals.
    """
    matrix = np.asarray(emb)
    query = np.asarray(vec)
    if method == 'cos':
        # dist: word index -> similarity
        dist = matrix @ query
        # top: ranking -> word index
        top = dist.argsort()[::-1][:count]
    else:
        dist = np.sum(np.square(matrix - query), axis=1)
        top = dist.argsort()[:count]

    return [(inv_words_dict[i], "%.2f" % dist[i]) for i in top]

def to_word_indices(words):
    """Translate each word into its index using the global `words_dict`.

    Raises KeyError for a word that is not in `words_dict`.
    """
    indices = []
    for word in words:
        indices.append(words_dict[word])
    return indices

In [None]:
# Report, probe and checkpoint every DEBUG_SIZE steps.
DEBUG_SIZE = 100

# NOTE(review): `generator`, `PAD` and `inv_words_dict` are not defined in any
# earlier visible cell (`generator` is only assigned in the inference cell
# further down) — confirm they exist before running on a fresh kernel.
with tf.Session(graph=graph) as session:
    init.run()
    # Overwrite the randomly initialised frozen variables with the pretrained ones.
    word2VecSaver.restore(session, WORDS2VEC_MODEL)
    #saver.restore(session, MODEL)

    run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
    run_metadata = tf.RunMetadata()

    # Fixed probe sentences, decoded at every report to eyeball progress.
    probe_feed = {
        inputs: [
            to_word_indices(['once', 'upon', 'a', 'time', PAD, PAD, PAD]),
            to_word_indices(['this', 'cat', 'is', 'cute', PAD, PAD, PAD]),
            to_word_indices(['the', 'weather', 'today', 'is', 'pretty', 'great', '.'])
        ],
        input_lengths: [4, 4, 7],
        decoder_masks: [[1,1,1,1,0,0,0], [1,1,1,1,0,0,0], [1,1,1,1,1,1,1]]
    }

    average_loss = 0
    average_max_loss = 0
    window_steps = 0  # losses accumulated since the last report
    for step in range(num_steps):
        batch_inputs, batch_input_lengths, batch_masks = next(generator)
        feed_dict = {inputs: batch_inputs, input_lengths: batch_input_lengths, decoder_masks: batch_masks}

        #_, loss_val = session.run([optimizer, loss], feed_dict=feed_dict, options=run_options, run_metadata=run_metadata)
        _, loss_val, max_loss_val = session.run([optimizer, loss, max_loss], feed_dict=feed_dict)
        average_loss += loss_val
        average_max_loss += max_loss_val
        window_steps += 1

        if step % DEBUG_SIZE == 0:
            if step > 0:
                # Divide by the number of accumulated steps: the first window
                # covers steps 0..DEBUG_SIZE, i.e. DEBUG_SIZE + 1 losses.
                average_loss /= window_steps
                average_max_loss /= window_steps
                print('Average mean loss at step ', step, ': ', average_loss)
                print('Average max loss at step ', step, ': ', average_max_loss)
                average_loss = 0
                average_max_loss = 0
                window_steps = 0

                a, b, c, d = session.run([decoder_outputs.rnn_output, seq_loss, loss, max_loss], probe_feed)
                for sentence_ids, sentence_outputs in zip(probe_feed[inputs], a):
                    print('encoder_input: ', [inv_words_dict[out] for out in sentence_ids])
                    print('decoder_output: ', [inv_words_dict[word.argmax()] for word in project.eval({
                        project_input: sentence_outputs
                    })])
                print(b)
                print(c)
                print(d)

            save_path = saver.save(session, MODEL)
            print("Model saved in file: %s" % save_path)

            # Create the Timeline object, and write it to a json
            # NOTE(review): no session.run call above passes run_options /
            # run_metadata (the traced call is commented out), so the trace
            # written here presumably contains no step statistics — confirm.
            tl = timeline.Timeline(run_metadata.step_stats)
            ctf = tl.generate_chrome_trace_format()
            with open('timeline.json', 'w') as f:
                f.write(ctf)


In [None]:
# Restore the trained autoencoder and decode two hand-written probe sentences.
with tf.Session(graph=graph) as session:
    saver.restore(session, MODEL)

    # NOTE(review): `sentenceGenerator` and `words_dict` are not defined in the
    # visible cells. The batch drawn here is not fed to the model; the
    # generator is kept because the training cell reads the name `generator`.
    generator = sentenceGenerator(sents, words_dict, batch_size=2)
    batch_inputs, batch_input_lengths, batch_masks = next(generator)

    # Both probes have exactly four tokens, so no padding is needed and every
    # mask entry is 1.
    feed_dict = {inputs: [
        to_word_indices(['once', 'upon', 'a', 'time']),
        to_word_indices(['this', 'cat', 'is', 'cute'])
    ], input_lengths:[4, 4], decoder_masks: [[1, 1, 1, 1], [1,1,1,1]]}

    a, b, c = session.run([decoder_outputs.rnn_output, seq_loss, loss], feed_dict)
    for sentence_ids, sentence_outputs in zip(feed_dict[inputs], a):
        print('encoder_input: ', [inv_words_dict[out] for out in sentence_ids])
        print('decoder_output: ', [inv_words_dict[word.argmax()] for word in project.eval({
            project_input: sentence_outputs
        })])
    print(b)
    print(c)