In [1]:
from Utils.FS import file
from Utils.tensorflow_helper import show_graph
from nltk.tokenize.punkt import PunktSentenceTokenizer
from nltk.tokenize import word_tokenize
from nltk.corpus import brown
from scipy.sparse import coo_matrix, dok_matrix
from scipy.sparse.linalg import svds
from sklearn.preprocessing import normalize
import matplotlib.pyplot as plt
import numpy as np
from sklearn.manifold import TSNE
from wordcloud import WordCloud
from nltk.corpus import stopwords
import tensorflow as tf
import math
import TextPreprocess.words2dict as words2dict
from tensorflow.python.layers import core as layers_core
from tensorflow.python.client import timeline

In [2]:
np.random.seed(1234)

In [3]:
sents = brown.sents()
sents = [[token.lower() for token in sent] for sent in sents]
words = brown.words()
words = [word.lower() for word in words]

In [4]:
print("Number of tokens: {}".format(len(words)))
print("Number of sentences: {}".format(len(sents)))
print("Longest sentences length: {}".format(max([len(sent) for sent in sents])))
MAX_SENTENCE_LENGTH = max([len(sent) for sent in sents])

Number of tokens: 1161192
Number of sentences: 57340
Longest sentences length: 180


In [5]:
words_dict, inv_words_dict = words2dict.convert(words)
print(words_dict['five'])
print(inv_words_dict[334])

words_size = len(words_dict)
print("Number of unique tokens: {}".format(words_size))

334
five
Number of unique tokens: 49815


In [6]:
VOCABULAY_SIZE = len(words_dict)
GO_SYMBOL = VOCABULAY_SIZE - 1
PADDING_SYMBOL = VOCABULAY_SIZE - 2
UNK_SYMBOL = VOCABULAY_SIZE - 3

In [7]:
def batchPadding(batch, padding_symbol=PADDING_SYMBOL):
    size = max([len(record) for record in batch])
    result = np.full((len(batch), size), padding_symbol)
    for i in range(len(batch)):
        result[i][:len(batch[i])] = batch[i]
    return result

def batchMask(batch):
    size = max([len(record) for record in batch])
    result = np.full((len(batch), size), 0.0)
    for i in range(len(batch)):
        result[i][:len(batch[i])] = 1
    return result

In [8]:
"""
def dataGenerator(sents, words_dict, window_size = 2, batch_size=32, train_length=2, epouch=-1, padding_symbol=PADDING_SYMBOL):
    train = []
    label = []
    length = []
    while(epouch < 0 or epouch > 0):
        left_window = [padding_symbol for i in range(window_size)]
        target = [padding_symbol for i in range(train_length)]
        right_window = [padding_symbol for i in range(window_size)]
        for sent in sents:
            for word in sent:
                right_window.append(words_dict[word])
                target.append(right_window.pop(0))
                left_window.append(target.pop(0))
                left_window.pop(0)
                
                for context in left_window + right_window:
                    train.append(list(target))
                    label.append(list([context]))
                    length.append(len(target))
                    if(len(train) == batch_size):
                        yield train, label, length
                        train = []
                        label = []
                        length = []
        epouch -= 1
        print('epouch done...')
"""

"\ndef dataGenerator(sents, words_dict, window_size = 2, batch_size=32, train_length=2, epouch=-1, padding_symbol=PADDING_SYMBOL):\n    train = []\n    label = []\n    length = []\n    while(epouch < 0 or epouch > 0):\n        left_window = [padding_symbol for i in range(window_size)]\n        target = [padding_symbol for i in range(train_length)]\n        right_window = [padding_symbol for i in range(window_size)]\n        for sent in sents:\n            for word in sent:\n                right_window.append(words_dict[word])\n                target.append(right_window.pop(0))\n                left_window.append(target.pop(0))\n                left_window.pop(0)\n                \n                for context in left_window + right_window:\n                    train.append(list(target))\n                    label.append(list([context]))\n                    length.append(len(target))\n                    if(len(train) == batch_size):\n                        yield train, label, length\

In [9]:
def sentenceGenerator(sents, words_dict, batch_size=32, epouch=-1, padding_symbol=PADDING_SYMBOL):
    train = []
    length = []
    while(epouch < 0 or epouch > 0):
        for sent in sents:
            train.append([words_dict[word] for word in sent])
            length.append(len(sent))
            if(len(train) == batch_size):
                yield batchPadding(train), length, batchMask(train)
                train = []
                length = []
        epouch -= 1
        print('epouch done...')

In [27]:
#TRAIN_LENGTH = 4
#WINDOW_SIZE = 2
BATCH_SIZE = 64

In [28]:
#generator = dataGenerator(sents, words_dict, window_size = WINDOW_SIZE, batch_size=BATCH_SIZE, train_length=TRAIN_LENGTH, epouch=1)
#bigram_list = []
#for batch_inputs, _, _ in generator:
#    bigram_list += ['_'.join([inv_words_dict[idx] for idx in batch_input]) for batch_input in batch_inputs]
        
#bigrams_dict, inv_bigrams_dict = words2dict.convert(bigram_list)
#generator = sentenceGenerator(sents, words_dict)
#batch, lengths, mask = next(generator)
#print(batch[1])
#print(lengths)
#print(mask[1])

In [29]:
"""
def visualizeData(generator):
    train, label, length = next(generator)
    for i in range(len(train)):
        print([inv_words_dict[word] for word in train[i]], [inv_words_dict[word] for word in label[i]], length[i])

generator = dataGenerator(sents[:1], words_dict, window_size = 1, batch_size=64, train_length=2)
#print(sents[0])
#visualizeData(generator)
"""

'\ndef visualizeData(generator):\n    train, label, length = next(generator)\n    for i in range(len(train)):\n        print([inv_words_dict[word] for word in train[i]], [inv_words_dict[word] for word in label[i]], length[i])\n\ngenerator = dataGenerator(sents[:1], words_dict, window_size = 1, batch_size=64, train_length=2)\n#print(sents[0])\n#visualizeData(generator)\n'

In [30]:
#generator = dataGenerator(sents, words_dict, window_size = WINDOW_SIZE, batch_size=BATCH_SIZE, train_length=TRAIN_LENGTH)
generator = sentenceGenerator(sents, words_dict, batch_size=BATCH_SIZE)

In [34]:
RNN_DIMENSION = [100]
RNN_LAYERS = len(RNN_DIMENSION)
DIMENSION = 50
NEGATIVE_SAMPLE = 128
MODE = 'train'

In [49]:
graph = tf.Graph()

with graph.as_default():
    
        #IN
        inputs = tf.placeholder(tf.int32, (None, None), name = "Input_Sentence_Word_Index")
        #OUT: (batch, time) int32
        
        batch_size = tf.shape(inputs)[0]
        steps = tf.shape(inputs)[1]
        
        #IN
        input_lengths = tf.placeholder(tf.int32, (None), name = "Input_Sentence_Length")
        #OUT: (batch) int32
                
        batch_size = tf.shape(inputs)[0]
        steps = tf.shape(inputs)[1]
        
        encoder_inputs = inputs
        #OUT: (batch, time) int32
        
        #decoder_inputs = tf.pad(tf.slice(encoder_inputs, [0,1], [batch_size, steps-1]), [[1, 0], [0, 0]])
        decoder_inputs = tf.pad(
            tf.slice(encoder_inputs, [0,0], [batch_size, steps-1]) - GO_SYMBOL, 
            [[0, 0], [1, 0]]
        ) + GO_SYMBOL
        #OUT: (batch, time) int32
        
        decoder_input_lengths = tf.reshape(input_lengths, [batch_size])
        #OUT: (batch) int32

        # assume same input length
        decoder_masks = tf.placeholder(tf.float32, (None, None), name = "Input_Sentence_Mask")
        #OUT: (batch, time)
        
        #labels = tf.placeholder(tf.int32, (None, 1), name = "Context_Word_Index")
        #OUT: (batch, 1) int32
        

        embeddings = tf.Variable(tf.random_uniform([VOCABULAY_SIZE, DIMENSION], -1.0, 1.0), trainable=False, name="Word2Vec")
        
        #IN: (batch, time) int32
        inputs_embed = tf.nn.embedding_lookup(embeddings, inputs, max_norm=1)
        #OUT: (batch, time, dim) float32
        decoder_embed = tf.nn.embedding_lookup(embeddings, decoder_inputs, max_norm=1)

        # OUT: [time [batch_size, DIMENSION]]
        
        
        ###### IN: (batch, time, DIMENSION) float32 ######
        
        with tf.variable_scope("Encoder") as encoder_scope:
            #encoder_cell = tf.contrib.rnn.LSTMCell(RNN_DIMENSION[-1])
            #cell = tf.contrib.rnn.GRUCell(RNN_DIMENSION[-1])
            stack = tf.contrib.rnn.MultiRNNCell([tf.contrib.rnn.GRUCell(RNN_DIMENSION[i]) for i in range(RNN_LAYERS)])
            #stack = tf.contrib.rnn.MultiRNNCell([tf.contrib.rnn.LSTMCell(RNN_DIMENSION[i]) for i in range(RNN_LAYERS)])
            #bw_stack = tf.contrib.rnn.MultiRNNCell([tf.contrib.rnn.GRUCell(RNN_DIMENSION[i]) for i in range(RNN_LAYERS)])

            #rnn_tuple_state = tuple([tf.nn.rnn_cell.LSTMStateTuple(l[i][0], l[i][1]) for i in range(RNN_LAYERS)])

            #cell = tf.contrib.rnn.LSTMCell(DIMENSION, state_is_tuple=True)        
            #cell = tf.contrib.rnn.GRUCell(DIMENSION)
            #initial_state = stack.zero_state(batch_size, tf.float32)
            #bw_initial_state = bw_stack.zero_state(batch_size, tf.float32)

            rnn_outputs, rnn_states = tf.nn.dynamic_rnn(stack, inputs_embed, initial_state=None, dtype=tf.float32, sequence_length=input_lengths)
            
            #IN: (batch, time, RNN_DIMENSION[-1]) float32
            index = tf.range(0, batch_size) * tf.shape(inputs)[1] + (input_lengths - 1)
            rnn_final_state = tf.gather(tf.reshape(rnn_outputs, [-1, RNN_DIMENSION[-1]]), index)
            #rnn_final_state = tf.clip_by_norm(rnn_final_state, 1, axes=[1])
            ###rnn_final_state = tf.gather(tf.reshape(rnn_outputs, [-1, DIMENSION]), index)
            #OUT: (batch, RNN_DIMENSION[-1])
                    
        ###### OUT: (batch, time, RNN_DIMENSION[-1]) float32 ######
        
        ###### IN: (batch, time, RNN_DIMENSION[-1]) float32 ######
        
        with tf.variable_scope("Decoder") as decoder_scope:
            decoder_cell = tf.contrib.rnn.GRUCell(RNN_DIMENSION[-1])
            #decoder_cell = tf.contrib.rnn.LSTMCell(RNN_DIMENSION[-1])
            #decoder_stack = tf.contrib.rnn.MultiRNNCell([tf.contrib.rnn.GRUCell(RNN_DIMENSION[i]) for i in range(RNN_LAYERS)])
            #decoder_stack = tf.contrib.rnn.MultiRNNCell([tf.contrib.rnn.LSTMCell(RNN_DIMENSION[i]) for i in range(RNN_LAYERS)])
            
            if MODE == "train":
                helper = tf.contrib.seq2seq.TrainingHelper(
                    inputs=decoder_embed,
                    sequence_length=decoder_input_lengths)
                
            elif MODE == "infer":
                helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
                    embedding=embedding,
                    start_tokens=tf.tile([GO_SYMBOL], [batch_size]),
                    end_token=END_SYMBOL)

            decoder = tf.contrib.seq2seq.BasicDecoder(
                cell=decoder_cell,
                helper=helper,
                initial_state=rnn_states[-1],
                #output_layer=layers_core.Dense(VOCABULAY_SIZE, use_bias=True, activation=None))
                output_layer=None)
            #sequence_loss has softmax already

            decoder_outputs, _ = tf.contrib.seq2seq.dynamic_decode(
                decoder=decoder,
                output_time_major=False,
                impute_finished=False,
                maximum_iterations=None,
                parallel_iterations=32,
                swap_memory=False,
                scope=None
            )
            
            sent_nce_weights = tf.Variable(
            tf.truncated_normal([VOCABULAY_SIZE, RNN_DIMENSION[-1]],
                                stddev=1.0 / math.sqrt(RNN_DIMENSION[-1])), trainable=True)

            sent_nce_biases = tf.Variable(tf.zeros([VOCABULAY_SIZE]), trainable=True)
        
            def nce_loss(labels, inputs):
                return tf.nn.nce_loss(
                    weights=sent_nce_weights,
                    biases=sent_nce_biases,
                    labels=tf.reshape(labels, [-1, 1]),
                    inputs=inputs,
                    num_sampled=NEGATIVE_SAMPLE,
                    num_classes=VOCABULAY_SIZE)
            
            seq_loss = tf.contrib.seq2seq.sequence_loss(
                logits=decoder_outputs.rnn_output,
                targets=inputs,
                weights=decoder_masks,
                average_across_timesteps=True,
                average_across_batch=True,
                softmax_loss_function=nce_loss,
                name=None
            )
        
            loss = tf.reduce_mean(seq_loss)

            """
            helper = tf.contrib.seq2seq.TrainingHelper(decoder_inputs, decoder_input_lengths)
            decoder = tf.contrib.seq2seq.BasicDecoder(cell, helper, rnn_final_state)
        
            decoder_outputs, decoder_states = tf.contrib.seq2seq.dynamic_decode(decoder=decoder)
            print(decoder_outputs)
            #rnn_outputs, rnn_states = tf.nn.dynamic_rnn(cell, inputs_embed, initial_state=rnn_final_state, sequence_length=input_lengths)
            
            #weights: A 2D Tensor of shape [batch_size x sequence_length] and dtype float. Weights constitutes the weighting of each prediction in the sequence. When using weights as masking set all valid timesteps to 1 and all padded timesteps to 0.
            loss = tf.reduce_mean(tf.contrib.seq2seq.sequence_loss(
                logits=decoder_outputs,
                targets=labels,
                weights=decoder_masks,
                average_across_timesteps=True,
                average_across_batch=True,
                softmax_loss_function=None,
                name=None
            ))
            """

        #optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)
        optimizer = tf.train.MomentumOptimizer(1.0, 0.5).minimize(loss)
        #optimizer = tf.train.AdamOptimizer().minimize(loss)
        
        init = tf.global_variables_initializer()
        saver = tf.train.Saver()
        embeddings_saver = tf.train.Saver({'Words2Vec': embeddings})
        #context = tf.nn.softmax(tf.matmul(rnn_final_state, tf.transpose(nce_weights)) + nce_biases)

ValueError: Tensor("Encoder/rnn/while/Exit_3:0", shape=(?, 100), dtype=float32) must be from the same graph as Tensor("Decoder/decoder/Const:0", shape=(), dtype=int32).

In [44]:
#show_graph(graph.as_graph_def())

In [45]:
num_steps = 200000
MODEL = './model/seq2seq-autoencoder.ckpt'
WORDS2VEC_MODEL = './model/brown-Words2Vec-{}.ckpt'.format(DIMENSION)

In [46]:
def cloestWord(vec, words_vec, count=10, method='cos'):
    if method == 'cos':
        dist = np.array([ sum(vec * words_vec[key]) for key in words_vec.keys()])
        top_ten = dist.argsort()[::-1][:10]
    else:
        dist = np.array([ sum(np.square(np.array(vec) - np.array(words_vec[key]))) for key in words_vec.keys()])
        top_ten = dist.argsort()[:10]
    return [list(words_vec.keys())[i] for i in top_ten]

def cloestWord2(word, emb, count=10, method='cos'):
    return cloestWord3(emb[words_dict[word]], emb, count, method)

def cloestWord3(vec, emb, count=10, method='cos'):
    if method == 'cos':
        dist = np.array([ sum(vec * emb[i]) for i in range(emb.shape[0])])
        # dist: word index -> dist
        
        top = dist.argsort()[::-1][:count]
        # top: ranking -> word index
        
    return [(inv_words_dict[i], "%.2f" % dist[i])  for i in top]

def to_word_indices(words):
    return [words_dict[word] for word in words]

In [47]:
DEBUG_SIZE = 200

with tf.Session(graph=graph) as session:
    init.run()
    embeddings_saver.restore(session, WORDS2VEC_MODEL)
    #saver.restore(session, MODEL)
    
    run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
    run_metadata = tf.RunMetadata()

      
    average_loss = 0
    for step in range(num_steps):
        batch_inputs, batch_input_lengths, batch_masks = next(generator)
        feed_dict = {inputs: batch_inputs, input_lengths: batch_input_lengths, decoder_masks: batch_masks}

        #_, loss_val = session.run([optimizer, loss], feed_dict=feed_dict, options=run_options, run_metadata=run_metadata)
        _, loss_val = session.run([optimizer, loss], feed_dict=feed_dict)
        average_loss += loss_val
        

        if step % DEBUG_SIZE == 0:
            if step > 0:
                average_loss /= DEBUG_SIZE
                print('Average loss at step ', step, ': ', average_loss)
                average_loss = 0
                
                emb = embeddings.eval()
                normalize(emb, norm='l2', axis=1, copy=False)
                print('word2vec: ', cloestWord2('two', emb))
                
                dict_list = [[i] for i in range(len(words_dict))]
                dict_list_lengths = [1 for i in range(len(words_dict))]
                emb = rnn_final_state.eval(feed_dict={inputs: dict_list, input_lengths: dict_list_lengths})
                normalize(emb, norm='l2', axis=1, copy=False)
                print('encoder output cloest words of "two": ', cloestWord2('two', emb))
                
                vec = rnn_final_state.eval(feed_dict={inputs: [to_word_indices(['good', 'morning'])], input_lengths: [2]})
                normalize(vec, norm='l2', axis=1, copy=False)
                print('encoder output cloest words of "good morning": ', cloestWord3(vec[0], emb))
                
                feed_dict = {inputs: [
                    to_word_indices(['once', 'upon', 'a', 'time']), 
                    to_word_indices(['this', 'cat', 'is', 'cute']),
                ], input_lengths:[4, 4], decoder_masks: [[1,1,1,1], [1,1,1,1]]}

                #print('encoder_inputs:', encoder_inputs.eval(feed_dict))
                #print('decoder_inputs:', decoder_inputs.eval(feed_dict))
                #print('decoder_masks: ', decoder_masks.eval(feed_dict))
                d_out = decoder_outputs.rnn_output.eval(feed_dict)
                print('decoder_outputs: ', [inv_words_dict[out.argsort()[::-1][0]] for out in d_out[0]])
                print('decoder_outputs: ', [inv_words_dict[out.argsort()[::-1][0]] for out in d_out[1]])

                
        if step % DEBUG_SIZE == 0:
            save_path = saver.save(session, MODEL)
            print("Model saved in file: %s" % save_path)
            
            # Create the Timeline object, and write it to a json
            tl = timeline.Timeline(run_metadata.step_stats)
            ctf = tl.generate_chrome_trace_format()
            with open('timeline.json', 'w') as f:
                f.write(ctf)


INFO:tensorflow:Restoring parameters from ./model/brown-Words2Vec-50.ckpt
Model saved in file: ./model/seq2seq-autoencoder.ckpt
Average loss at step  200 :  321.986343002
word2vec:  [('two', '1.00'), ('four', '0.94'), ('three', '0.94'), ('five', '0.92'), ('several', '0.90'), ('six', '0.88'), ('ten', '0.88'), ('few', '0.86'), ('seven', '0.83'), ('fifty', '0.82')]
encoder output cloest words of "two":  [('two', '1.00'), ('three', '1.00'), ('four', '1.00'), ('five', '1.00'), ('several', '1.00'), ('six', '1.00'), ('few', '1.00'), ('ten', '1.00'), ('seven', '1.00'), ('many', '1.00')]
encoder output cloest words of "good morning":  [('hardship', '0.89'), ('theology', '0.88'), ('lynn', '0.88'), ('masculine', '0.88'), ('bluntly', '0.88'), ('insecticide', '0.88'), ('worldly', '0.88'), ('nails', '0.88'), ('mileage', '0.88'), ('isotopic', '0.88')]
decoder_outputs:  ['his', '--', 'by', 'by']
decoder_outputs:  ['his', '--', 'by', 'two']
Model saved in file: ./model/seq2seq-autoencoder.ckpt
Average 

KeyboardInterrupt: 

In [20]:
with tf.Session(graph=graph) as session:
    embeddings_saver.restore(session, WORDS2VEC_MODEL)
    saver.restore(session, MODEL)
    
    input_sents = [
        to_word_indices(['once', 'upon', 'a', 'time']), 
        to_word_indices(['this', 'cat', 'is', 'cute']),
        to_word_indices(sents[0]),
        to_word_indices(sents[1])
    ]
    
    input_sents = batchPadding(input_sents)
                      
    feed_dict = {inputs: input_sents, input_lengths:[input_sents.shape[1] for i in range(input_sents.shape[0])], decoder_masks: batchMask(input_sents)}
    d_out = decoder_outputs.rnn_output.eval(feed_dict)
    print(seq_loss.eval(feed_dict))

    print('decoder_outputs: ', [inv_words_dict[out.argsort()[::-1][0]] for out in d_out[0]])
    print('decoder_outputs: ', [inv_words_dict[out.argsort()[::-1][0]] for out in d_out[1]])
    print('decoder_outputs: ', [inv_words_dict[out.argsort()[::-1][0]] for out in d_out[2]])
    print('decoder_outputs: ', [inv_words_dict[out.argsort()[::-1][0]] for out in d_out[3]])

INFO:tensorflow:Restoring parameters from ./model/brown-Words2Vec-50.ckpt
INFO:tensorflow:Restoring parameters from ./model/seq2seq-autoencoder.ckpt
3.92263
decoder_outputs:  ['may', 'may', 'may', 'may', 'may', 'may', 'may', 'may', 'may', 'may', 'may', 'may', 'may', 'may', 'may', 'may', 'may', 'may', 'may', 'may', 'may', 'may', 'may', 'may', 'may', 'may', 'may', 'may', 'may', 'may', 'may', 'may', 'may', 'may', 'may', 'may', 'may', 'may', 'may', 'may', 'may', 'may', 'may']
decoder_outputs:  ['may', 'may', 'may', 'may', 'may', 'may', 'may', 'may', 'may', 'may', 'may', 'may', 'may', 'may', 'may', 'may', 'may', 'may', 'may', 'may', 'may', 'may', 'may', 'may', 'may', 'may', 'may', 'may', 'may', 'may', 'may', 'may', 'may', 'may', 'may', 'may', 'may', 'may', 'may', 'may', 'may', 'may', 'may']
decoder_outputs:  ['may', 'may', 'may', 'may', 'may', 'may', 'may', 'may', 'may', 'may', 'may', 'may', 'may', 'may', 'may', 'may', 'may', 'may', 'may', 'may', 'may', 'may', 'may', 'may', 'may', 'may', 'm