In [1]:
from Utils.FS import file
from Utils.tensorflow_helper import show_graph
from nltk.tokenize.punkt import PunktSentenceTokenizer
from nltk.tokenize import word_tokenize
from nltk.corpus import brown
from scipy.sparse import coo_matrix, dok_matrix
from scipy.sparse.linalg import svds
from sklearn.preprocessing import normalize
import matplotlib.pyplot as plt
import numpy as np
from sklearn.manifold import TSNE
from wordcloud import WordCloud
from nltk.corpus import stopwords
import tensorflow as tf
import math
import TextPreprocess.words2dict as words2dict

In [2]:
np.random.seed(1234)

In [3]:
sents = brown.sents()
sents = [[token.lower() for token in sent] for sent in sents]
words = brown.words()
words = [word.lower() for word in words]

In [4]:
print("Number of tokens: {}".format(len(words)))
print("Number of sentences: {}".format(len(sents)))
print("Longest sentences length: {}".format(max([len(sent) for sent in sents])))
MAX_SENTENCE_LENGTH = max([len(sent) for sent in sents])

Number of tokens: 1161192
Number of sentences: 57340
Longest sentences length: 180


In [5]:
words_dict, inv_words_dict = words2dict.convert(words)
print(words_dict['five'])
print(inv_words_dict[334])

words_size = len(words_dict)
print("Number of unique tokens: {}".format(words_size))

334
five
Number of unique tokens: 49815


In [6]:
def batchPadding(batch, padding_symbol=words_dict['--']):
    size = max([len(record) for record in batch])
    result = np.full((len(batch), size), padding_symbol)
    for i in range(len(batch)):
        result[i][:len(batch[i])] = batch[i]
    return result

In [7]:
def dataGenerator(sents, words_dict, window_size = 2, batch_size=32, train_length=2):
    train = []
    label = []
    length = []
    while(True):
        left_window = [words_dict['--'] for i in range(window_size)]
        target = [words_dict['--'] for i in range(train_length)]
        right_window = [words_dict['--'] for i in range(window_size)]
        for sent in sents:
            for word in sent:
                right_window.append(words_dict[word])
                target.append(right_window.pop(0))
                left_window.append(target.pop(0))
                left_window.pop(0)
                
                for context in left_window + right_window:
                    train.append(list(target))
                    label.append(list([context]))
                    length.append(len(target))
                    if(len(train) == batch_size):
                        yield train, label, length
                        train = []
                        label = []
                        length = []

        print('epouch done...')

In [8]:
TRAIN_LENGTH = 1
WINDOW_SIZE = 2
BATCH_SIZE = 32

In [9]:
def visualizeData(generator):
    train, label, length = next(generator)
    for i in range(len(train)):
        print([inv_words_dict[word] for word in train[i]], [inv_words_dict[word] for word in label[i]], length[i])

generator = dataGenerator(sents[:1], words_dict, window_size = 1, batch_size=64, train_length=2)
#print(sents[0])
#visualizeData(generator)

In [10]:
generator = dataGenerator(sents, words_dict, window_size = WINDOW_SIZE, batch_size=BATCH_SIZE, train_length=TRAIN_LENGTH)

In [11]:
RNN_DIMENSION = [50]
RNN_LAYERS = 1
DIMENSION = 50
VOCABULAY_SIZE = len(words_dict)
NEGATIVE_SAMPLE = 128

In [12]:
graph = tf.Graph()

with graph.as_default():
    
        inputs = tf.placeholder(tf.int32, (None, None), name = "Input_Sentence_Word_Index")
        #OUT: (batch, time) int32
        
        input_lengths = tf.placeholder(tf.int32, (None), name = "Input_Sentence_Length")
        #OUT: (batch) int32
        
        labels = tf.placeholder(tf.int32, (None, 1), name = "Context_Word_Index")
        #OUT: (batch, 1) int32
        
        batch_size = tf.shape(inputs)[0]
        
        embeddings = tf.Variable(tf.random_uniform([VOCABULAY_SIZE, DIMENSION], -1.0, 1.0), trainable=False, name="Word2Vec")
        
        #IN: (batch, time) int32
        inputs_embed = tf.nn.embedding_lookup(embeddings, inputs, max_norm=1)
        #OUT: (batch, time, dim) float32
        
        #IN: (batch, time, dim) 
        #weights = [tf.Variable(tf.random_uniform([DIMENSION * 2, DIMENSION], minval=-1, maxval=1)) for i in range(RNN_LAYERS)]
        #bias = [tf.Variable(tf.random_uniform([DIMENSION], minval=-1, maxval=1)) for i in range(RNN_LAYERS)]
        
        #nn_input = tf.reshape(inputs_embed, (batch_size, DIMENSION))
        #nn_output = tf.ones((batch_size, DIMENSION))
        #for i in range(RNN_LAYERS):
        #    nn_output = tf.concat([nn_output, nn_input], 1)
        #    nn_output = tf.tanh(tf.matmul(nn_output, weights[i]) + bias[i])
        #out: (batch, DIMENSION)

        #IN: (batch, time, dim) float32
        #rnn_inputs = tf.transpose(inputs_embed, [1, 0, 2])
        rnn_inputs = inputs_embed
        #OUT: (batch, time, dim) float32
        
        with tf.variable_scope("LSTM") as lstm_scope:

            cell = tf.contrib.rnn.LSTMCell(RNN_DIMENSION[-1])        
            #stack = tf.contrib.rnn.MultiRNNCell([tf.contrib.rnn.LSTMCell(RNN_DIMENSION[i]) for i in range(RNN_LAYERS)])

            #rnn_tuple_state = tuple([tf.nn.rnn_cell.LSTMStateTuple(l[i][0], l[i][1]) for i in range(RNN_LAYERS)])

            #cell = tf.contrib.rnn.LSTMCell(DIMENSION, state_is_tuple=True)        
            #cell = tf.contrib.rnn.GRUCell(DIMENSION)
            initial_state = cell.zero_state(batch_size, tf.float32)

            #IN: (batch, time, DIMENSION) float32
            rnn_outputs, rnn_states = tf.nn.dynamic_rnn(cell, rnn_inputs, initial_state=initial_state, sequence_length=input_lengths)
            #OUT: (batch, time, RNN_DIMENSION[-1]) float32
            lstm_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=lstm_scope.name)
            print(lstm_variables)
        
        
        #IN: (batch, time, RNN_DIMENSION[-1]) float32
        index = tf.range(0, batch_size) * tf.shape(inputs)[1] + (input_lengths - 1)
        rnn_final_state = tf.gather(tf.reshape(rnn_outputs, [-1, RNN_DIMENSION[-1]]), index)
        #rnn_final_state = tf.clip_by_norm(rnn_final_state, 1, axes=[1])
        ###rnn_final_state = tf.gather(tf.reshape(rnn_outputs, [-1, DIMENSION]), index)
        #OUT: (batch, RNN_DIMENSION[-1])
        
        #IN: (batch, DIMENSION)
        nce_weights = tf.Variable(
            tf.truncated_normal([VOCABULAY_SIZE, DIMENSION],
                                stddev=1.0 / math.sqrt(DIMENSION)), name="NCE_Weights", trainable=True)

        nce_biases = tf.Variable(tf.zeros([VOCABULAY_SIZE]), name="NCE_Biases", trainable=True)
        
        loss = tf.reduce_mean(
          tf.nn.nce_loss(weights=nce_weights,
                     biases=nce_biases,
                     labels=labels,
                     inputs=rnn_final_state,
                     num_sampled=NEGATIVE_SAMPLE,
                     num_classes=VOCABULAY_SIZE))

        #optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)
        optimizer = tf.train.MomentumOptimizer(1.0, 0.5).minimize(loss)
        #optimizer = tf.train.AdamOptimizer().minimize(loss)
        
        init = tf.global_variables_initializer()
        saver = tf.train.Saver()
        embeddings_saver = tf.train.Saver({'Words2Vec': embeddings})
        #context = tf.nn.softmax(tf.matmul(rnn_final_state, tf.transpose(nce_weights)) + nce_biases)

[<tf.Variable 'LSTM/rnn/lstm_cell/weights:0' shape=(100, 200) dtype=float32_ref>, <tf.Variable 'LSTM/rnn/lstm_cell/biases:0' shape=(200,) dtype=float32_ref>]


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [13]:
show_graph(graph.as_graph_def())

In [14]:
num_steps = 200000
MODEL = './model/2Words2Vec-tf-rnn-nce.ckpt'
WORDS2VEC_MODEL = './model/brown-Words2Vec-{}.ckpt'.format(DIMENSION)

In [15]:
def cloestWord(vec, words_vec, count=10, method='cos'):
    if method == 'cos':
        dist = np.array([ sum(vec * words_vec[key]) for key in words_vec.keys()])
        top_ten = dist.argsort()[::-1][:10]
    else:
        dist = np.array([ sum(np.square(np.array(vec) - np.array(words_vec[key]))) for key in words_vec.keys()])
        top_ten = dist.argsort()[:10]
    return [list(words_vec.keys())[i] for i in top_ten]

def cloestWord2(word, emb, count=10, method='cos'):
    if method == 'cos':
        dist = np.array([ sum(emb[words_dict[word]] * emb[i]) for i in range(emb.shape[0])])
        # dist: word index -> dist
        
        top = dist.argsort()[::-1][:count]
        # top: ranking -> word index
        
    return [(inv_words_dict[i], "%.2f" % dist[i])  for i in top]

In [16]:
with tf.Session(graph=graph) as session:
    #init.run()
    #embeddings_saver.restore(session, WORDS2VEC_MODEL)
    saver.restore(session, MODEL)
      
    average_loss = 0
    for step in range(num_steps):
        batch_inputs, batch_labels, batch_input_lengths = next(generator)
        feed_dict = {inputs: batch_inputs, labels: batch_labels, input_lengths: batch_input_lengths}

        _, loss_val = session.run([optimizer, loss], feed_dict=feed_dict)
        average_loss += loss_val

        if step % 2000 == 0:
            if step > 0:
                average_loss /= 2000
                print('Average loss at step ', step, ': ', average_loss)
                average_loss = 0
                
                emb = embeddings.eval()
                normalize(emb, norm='l2', axis=1, copy=False)
                print('word2vec: ', cloestWord2('two', emb))
                
                dict_list = [[i] for i in range(len(words_dict))]
                dict_list_lengths = [1 for i in range(len(words_dict))]
                #emb = nn_output.eval(feed_dict={inputs: dict_list, input_lengths: dict_list_lengths})
                emb = rnn_final_state.eval(feed_dict={inputs: dict_list, input_lengths: dict_list_lengths})
                normalize(emb, norm='l2', axis=1, copy=False)
                print('rnn: ', cloestWord2('two', emb))
                #print(emb[words_dict['two']], emb[words_dict['three']])
                #print(rnn_final_state)
                #print('rnn_vec: ', emb[0], emb[1234], emb[2345])
                
                #index_eval = index.eval(feed_dict={inputs: [[1,2,3], [100,200,300]], input_lengths: [1, 3]})
                #rnn_outputs_eval = rnn_outputs.eval(feed_dict={inputs: [[1,2,3], [100,200,3300]], input_lengths: [1, 3]})
                #rnn_final_state_eval = rnn_final_state.eval(feed_dict={inputs: [[1,2,3], [100,200,3300]], input_lengths: [1, 3]})
                #print(index_eval)
                #print(rnn_outputs_eval)
                #print(rnn_final_state_eval)
                
        if step % 2000 == 0:
            save_path = saver.save(session, MODEL)
            print("Model saved in file: %s" % save_path)

INFO:tensorflow:Restoring parameters from ./model/2Words2Vec-tf-rnn-nce.ckpt


NotFoundError: Unsuccessful TensorSliceReader constructor: Failed to find any matching files for ./model/2Words2Vec-tf-rnn-nce.ckpt
	 [[Node: save/RestoreV2_8 = RestoreV2[dtypes=[DT_FLOAT], _device="/job:localhost/replica:0/task:0/cpu:0"](_recv_save/Const_0, save/RestoreV2_8/tensor_names, save/RestoreV2_8/shape_and_slices)]]

Caused by op 'save/RestoreV2_8', defined at:
  File "/Users/Hoiy/anaconda/envs/tensorflow/lib/python3.5/runpy.py", line 184, in _run_module_as_main
    "__main__", mod_spec)
  File "/Users/Hoiy/anaconda/envs/tensorflow/lib/python3.5/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/Users/Hoiy/anaconda/envs/tensorflow/lib/python3.5/site-packages/ipykernel/__main__.py", line 3, in <module>
    app.launch_new_instance()
  File "/Users/Hoiy/anaconda/envs/tensorflow/lib/python3.5/site-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/Users/Hoiy/anaconda/envs/tensorflow/lib/python3.5/site-packages/ipykernel/kernelapp.py", line 474, in start
    ioloop.IOLoop.instance().start()
  File "/Users/Hoiy/anaconda/envs/tensorflow/lib/python3.5/site-packages/zmq/eventloop/ioloop.py", line 177, in start
    super(ZMQIOLoop, self).start()
  File "/Users/Hoiy/anaconda/envs/tensorflow/lib/python3.5/site-packages/tornado/ioloop.py", line 887, in start
    handler_func(fd_obj, events)
  File "/Users/Hoiy/anaconda/envs/tensorflow/lib/python3.5/site-packages/tornado/stack_context.py", line 275, in null_wrapper
    return fn(*args, **kwargs)
  File "/Users/Hoiy/anaconda/envs/tensorflow/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 440, in _handle_events
    self._handle_recv()
  File "/Users/Hoiy/anaconda/envs/tensorflow/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 472, in _handle_recv
    self._run_callback(callback, msg)
  File "/Users/Hoiy/anaconda/envs/tensorflow/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 414, in _run_callback
    callback(*args, **kwargs)
  File "/Users/Hoiy/anaconda/envs/tensorflow/lib/python3.5/site-packages/tornado/stack_context.py", line 275, in null_wrapper
    return fn(*args, **kwargs)
  File "/Users/Hoiy/anaconda/envs/tensorflow/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 276, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/Users/Hoiy/anaconda/envs/tensorflow/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 228, in dispatch_shell
    handler(stream, idents, msg)
  File "/Users/Hoiy/anaconda/envs/tensorflow/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 390, in execute_request
    user_expressions, allow_stdin)
  File "/Users/Hoiy/anaconda/envs/tensorflow/lib/python3.5/site-packages/ipykernel/ipkernel.py", line 196, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/Users/Hoiy/anaconda/envs/tensorflow/lib/python3.5/site-packages/ipykernel/zmqshell.py", line 501, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/Users/Hoiy/anaconda/envs/tensorflow/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2717, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "/Users/Hoiy/anaconda/envs/tensorflow/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2821, in run_ast_nodes
    if self.run_code(code, result):
  File "/Users/Hoiy/anaconda/envs/tensorflow/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2881, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-12-8fb28b8d4de5>", line 83, in <module>
    saver = tf.train.Saver()
  File "/Users/Hoiy/anaconda/envs/tensorflow/lib/python3.5/site-packages/tensorflow/python/training/saver.py", line 1056, in __init__
    self.build()
  File "/Users/Hoiy/anaconda/envs/tensorflow/lib/python3.5/site-packages/tensorflow/python/training/saver.py", line 1086, in build
    restore_sequentially=self._restore_sequentially)
  File "/Users/Hoiy/anaconda/envs/tensorflow/lib/python3.5/site-packages/tensorflow/python/training/saver.py", line 691, in build
    restore_sequentially, reshape)
  File "/Users/Hoiy/anaconda/envs/tensorflow/lib/python3.5/site-packages/tensorflow/python/training/saver.py", line 407, in _AddRestoreOps
    tensors = self.restore_op(filename_tensor, saveable, preferred_shard)
  File "/Users/Hoiy/anaconda/envs/tensorflow/lib/python3.5/site-packages/tensorflow/python/training/saver.py", line 247, in restore_op
    [spec.tensor.dtype])[0])
  File "/Users/Hoiy/anaconda/envs/tensorflow/lib/python3.5/site-packages/tensorflow/python/ops/gen_io_ops.py", line 669, in restore_v2
    dtypes=dtypes, name=name)
  File "/Users/Hoiy/anaconda/envs/tensorflow/lib/python3.5/site-packages/tensorflow/python/framework/op_def_library.py", line 768, in apply_op
    op_def=op_def)
  File "/Users/Hoiy/anaconda/envs/tensorflow/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 2336, in create_op
    original_op=self._default_original_op, op_def=op_def)
  File "/Users/Hoiy/anaconda/envs/tensorflow/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 1228, in __init__
    self._traceback = _extract_stack()

NotFoundError (see above for traceback): Unsuccessful TensorSliceReader constructor: Failed to find any matching files for ./model/2Words2Vec-tf-rnn-nce.ckpt
	 [[Node: save/RestoreV2_8 = RestoreV2[dtypes=[DT_FLOAT], _device="/job:localhost/replica:0/task:0/cpu:0"](_recv_save/Const_0, save/RestoreV2_8/tensor_names, save/RestoreV2_8/shape_and_slices)]]


In [None]:
with tf.Session(graph=graph) as session:
    saver.restore(session, MODEL)
    
    feed_dict={inputs: [[words_dict['two']], [words_dict['new']], [words_dict['funny']]], input_lengths: [1, 1, 1]}
    
    index_eval = index.eval(feed_dict=feed_dict)
    rnn_outputs_eval = rnn_outputs.eval(feed_dict=feed_dict)
    rnn_final_state_eval = rnn_final_state.eval(feed_dict=feed_dict)
    print("index:\n", index_eval)
    print("rnn_output:\n", rnn_outputs_eval)
    print("rnn_final:\n", rnn_final_state_eval)



In [None]:
with tf.Session(graph=graph) as session:
    #saver.restore(session, MODEL)
    embeddings_saver.restore(session, WORDS2VEC_MODEL)
    final_embeddings = embeddings.eval()

from sklearn.preprocessing import normalize
normalize(final_embeddings, norm='l2', axis=1, copy=False)

words_vec = {}
for i in range(final_embeddings.shape[0]):
    words_vec[inv_words_dict[i]] = final_embeddings[i]
  
words_vec2 = {}
with tf.Session(graph=graph) as session:
    saver.restore(session, MODEL)
    
    for key in words_vec.keys():
        feed_dict = {inputs: [[words_dict[key]]], input_lengths: [1]}
        words_vec2[key] = normalize(rnn_final_state.eval(feed_dict), norm='l2', copy=True)[0]
    

In [None]:
def sent2Context(sent):
    with tf.Session(graph=graph) as session:
        saver.restore(session, MODEL)
        feed_dict = {inputs: [[words_dict[word] for word in sent]], input_lengths: ([len(sent)])}
        context_pred = context.eval(feed_dict)
        print(rnn_final_state.eval(feed_dict))
        return [inv_words_dict[i] for i in context_pred.argsort()[0][::-1][:10]]
    
def twoWords2Vec(sent):
    with tf.Session(graph=graph) as session:
        saver.restore(session, MODEL)
        feed_dict = {inputs: [[words_dict[word] for word in sent]], input_lengths: ([len(sent)])}
        return normalize(rnn_final_state.eval(feed_dict), norm='l2', copy=True)[0]

In [None]:
print(twoWords2Vec(['run', 'faster']))

In [None]:
#print(cloestWord(twoWords2Vec(['run', 'faster'])))
#print(cloestWord(twoWords2Vec(['two', 'idiot'])))
#print(cloestWord(words_vec['two']))
#print(cloestWord(words_vec['but']))
#print(cloestWord(words_vec['man']))


#print(cloestWord(words_vec2['two'], words_vec2))
#print(cloestWord(words_vec2['but'], words_vec2))
#print(cloestWord(words_vec2['man'], words_vec2))

print(cloestWord(words_vec2['two'], words_vec2))
print(cloestWord(words_vec2['however'], words_vec2))
print(cloestWord(words_vec2['man'], words_vec2))

#print(cloestWord(twoWords2Vec(['but']), words_vec2))
#print(cloestWord(twoWords2Vec(['man']), words_vec2))
#print(cloestWord(twoWords2Vec(['two'])))
"""
def rnn_out(sent):
    with tf.Session(graph=graph) as session:
        saver.restore(session, MODEL)
        feed_dict = {inputs: [[words_dict[word] for word in sent]], input_lengths: ([len(sent)])}
        print('rnn_inputs: ', rnn_inputs.eval(feed_dict))
        print('rnn_outputs: ', rnn_outputs.eval(feed_dict))
        print('rnn_final_state:', rnn_final_state.eval(feed_dict))

rnn_out(['two'])
rnn_out(['three'])
words_vec['two']
"""

In [None]:
WORDS2VEC_MODEL = './model/brown-Words2Vec.ckpt'

graph2 = tf.Graph()

with graph2.as_default():
    embeddings2 = tf.Variable(
            tf.random_uniform([VOCABULAY_SIZE, DIMENSION], -1.0, 1.0), name='Words2Vec')
    embeddings_saver2 = tf.train.Saver({'Words2Vec': embeddings2})

In [None]:
with tf.Session(graph=graph2) as session2:
    embeddings_saver2.restore(session2, WORDS2VEC_MODEL)
    final_embeddings2 = embeddings2.eval()
    
final_embeddings2 = normalize(final_embeddings2, norm='l2', axis=1, copy=True)

words_vec3 = {}
for i in range(final_embeddings2.shape[0]):
    words_vec3[inv_words_dict[i]] = final_embeddings2[i]


In [None]:
print(cloestWord(words_vec3['two'], words_vec3))
print(cloestWord(words_vec3['however'], words_vec3))
print(cloestWord(words_vec3['man'], words_vec3))

In [None]:
print(words_dict['five'])
print(inv_words_dict[334])