In [1]:
from Utils.FS import file
from Utils.tensorflow_helper import show_graph
from nltk.tokenize.punkt import PunktSentenceTokenizer
from nltk.tokenize import word_tokenize
from nltk.corpus import brown
from scipy.sparse import coo_matrix, dok_matrix
from scipy.sparse.linalg import svds
from sklearn.preprocessing import normalize
import matplotlib.pyplot as plt
import numpy as np
from sklearn.manifold import TSNE
from wordcloud import WordCloud
from nltk.corpus import stopwords
import tensorflow as tf
import math
import TextPreprocess.words2dict as words2dict

In [2]:
np.random.seed(1234)

In [3]:
sents = brown.sents()
sents = [[token.lower() for token in sent] for sent in sents]
words = brown.words()
words = [word.lower() for word in words]

In [4]:
print("Number of tokens: {}".format(len(words)))
print("Number of sentences: {}".format(len(sents)))
print("Longest sentences length: {}".format(max([len(sent) for sent in sents])))
MAX_SENTENCE_LENGTH = max([len(sent) for sent in sents])

Number of tokens: 1161192
Number of sentences: 57340
Longest sentences length: 180


In [5]:
words_dict, inv_words_dict = words2dict.convert(words)
print(words_dict['five'])
print(inv_words_dict[334])

words_size = len(words_dict)
print("Number of unique tokens: {}".format(words_size))

334
five
Number of unique tokens: 49815


In [6]:
def batchPadding(batch, padding_symbol=words_dict['--']):
    size = max([len(record) for record in batch])
    result = np.full((len(batch), size), padding_symbol)
    for i in range(len(batch)):
        result[i][:len(batch[i])] = batch[i]
    return result

In [7]:
def dataGenerator(sents, words_dict, window_size = 2, batch_size=32, train_length=2):
    train = []
    label = []
    length = []
    while(True):
        left_window = [words_dict['--'] for i in range(window_size)]
        target = [words_dict['--'] for i in range(train_length)]
        right_window = [words_dict['--'] for i in range(window_size)]
        for sent in sents:
            for word in sent:
                right_window.append(words_dict[word])
                target.append(right_window.pop(0))
                left_window.append(target.pop(0))
                left_window.pop(0)
                
                for context in left_window + right_window:
                    train.append(list(target))
                    label.append(list([context]))
                    length.append(len(target))
                    if(len(train) == batch_size):
                        yield train, label, length
                        train = []
                        label = []
                        length = []

        print('epouch done...')

In [35]:
TRAIN_LENGTH = 2
WINDOW_SIZE = 2
BATCH_SIZE = 256

In [36]:
def visualizeData(generator):
    train, label, length = next(generator)
    for i in range(len(train)):
        print([inv_words_dict[word] for word in train[i]], [inv_words_dict[word] for word in label[i]], length[i])

generator = dataGenerator(sents[:1], words_dict, window_size = 1, batch_size=64, train_length=2)
#print(sents[0])
#visualizeData(generator)

In [37]:
generator = dataGenerator(sents, words_dict, window_size = WINDOW_SIZE, batch_size=BATCH_SIZE, train_length=TRAIN_LENGTH)

In [38]:
DIMENSION = 50
VOCABULAY_SIZE = len(words_dict)
NEGATIVE_SAMPLE = 64

In [39]:
graph = tf.Graph()

with graph.as_default():
    
        inputs = tf.placeholder(tf.int32, (None, None), name = "Input_Sentence_Word_Index")
        #OUT: (batch, time) int32
        
        input_lengths = tf.placeholder(tf.int32, (None), name = "Input_Sentence_Length")
        #OUT: (batch) int32
        
        labels = tf.placeholder(tf.int32, (None, 1), name = "Context_Word_Index")
        #OUT: (batch, 1) int32
        
        batch_size = tf.shape(inputs)[0]
        embeddings = tf.Variable(tf.zeros([VOCABULAY_SIZE, DIMENSION], tf.float32), trainable=False, name="Word2Vec")
        
        #IN: (batch, time) int32
        inputs_embed = tf.nn.embedding_lookup(embeddings, inputs, max_norm=1)
        #OUT: (batch, time, dim) float32

        #IN: (batch, time, dim) float32
        rnn_inputs = tf.transpose(inputs_embed, [1, 0, 2])
        ###rnn_inputs = inputs_embed
        #OUT: (time, batch, dim) float32

        cell = tf.contrib.rnn.LSTMCell(DIMENSION, state_is_tuple=True)        
        #cell = tf.contrib.rnn.GRUCell(DIMENSION)
        initial_state = cell.zero_state(batch_size, tf.float32)
        
        #IN: (time, batch, DIMENSION) float32
        rnn_outputs, rnn_states = tf.nn.dynamic_rnn(cell, rnn_inputs, initial_state=initial_state, sequence_length=input_lengths, time_major=True)
        #OUT: (time, batch, DIMENSION) float32
        
        #IN: (time, batch, DIMENSION) float32
        index = tf.range(0, batch_size) * tf.shape(inputs)[1] + (input_lengths - 1)
        rnn_final_state = tf.gather(tf.reshape(tf.transpose(rnn_outputs, (1,0,2)), [-1, DIMENSION]), index)
        rnn_final_state = tf.clip_by_norm(rnn_final_state, 1, axes=[1])
        ###rnn_final_state = tf.gather(tf.reshape(rnn_outputs, [-1, DIMENSION]), index)
        #OUT: (batch, DIMENSION)
        
        nce_weights = tf.Variable(
            tf.truncated_normal([VOCABULAY_SIZE, DIMENSION],
                                stddev=1.0 / math.sqrt(DIMENSION)), name="NCE_Weights")

        nce_biases = tf.Variable(tf.zeros([VOCABULAY_SIZE]), name="NCE_Bias")

        loss = tf.reduce_mean(
          tf.nn.nce_loss(weights=nce_weights,
                     biases=nce_biases,
                     labels=labels,
                     inputs=rnn_final_state,
                     num_sampled=NEGATIVE_SAMPLE,
                     num_classes=VOCABULAY_SIZE))

        optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)
        #optimizer = tf.train.AdamOptimizer(1.0).minimize(loss)
        
        init = tf.global_variables_initializer()
        saver = tf.train.Saver()
        embeddings_saver = tf.train.Saver({'Words2Vec': embeddings})
        context = tf.nn.softmax(tf.matmul(rnn_final_state, tf.transpose(nce_weights)) + nce_biases)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [40]:
#show_graph(graph.as_graph_def())

In [41]:
num_steps = 200000
MODEL = './model/2Words2Vec-tf-rnn-nce.ckpt'
WORDS2VEC_MODEL = './model/brown-Words2Vec.ckpt'



In [48]:
with tf.Session(graph=graph) as session:
    #init.run()
    #embeddings_saver.restore(session, WORDS2VEC_MODEL)
    saver.restore(session, MODEL)
      
    average_loss = 0
    for step in range(num_steps):
        batch_inputs, batch_labels, batch_input_lengths = next(generator)
        feed_dict = {inputs: batch_inputs, labels: batch_labels, input_lengths: batch_input_lengths}

        _, loss_val = session.run([optimizer, loss], feed_dict=feed_dict)
        average_loss += loss_val

        if step % 2000 == 0:
            if step > 0:
                average_loss /= 2000
                print('Average loss at step ', step, ': ', average_loss)
                average_loss = 0
                
        if step % 2000 == 0:
            save_path = saver.save(session, MODEL)
            print("Model saved in file: %s" % save_path)

INFO:tensorflow:Restoring parameters from ./model/2Words2Vec-tf-rnn-nce.ckpt
Model saved in file: ./model/2Words2Vec-tf-rnn-nce.ckpt
Average loss at step  2000 :  4.96578600144
Model saved in file: ./model/2Words2Vec-tf-rnn-nce.ckpt
Average loss at step  4000 :  4.93373045754
Model saved in file: ./model/2Words2Vec-tf-rnn-nce.ckpt
Average loss at step  6000 :  4.94054861641
Model saved in file: ./model/2Words2Vec-tf-rnn-nce.ckpt
Average loss at step  8000 :  4.91869240141
Model saved in file: ./model/2Words2Vec-tf-rnn-nce.ckpt
Average loss at step  10000 :  4.88599774981
Model saved in file: ./model/2Words2Vec-tf-rnn-nce.ckpt
Average loss at step  12000 :  4.85607321215
Model saved in file: ./model/2Words2Vec-tf-rnn-nce.ckpt
Average loss at step  14000 :  4.89905647182
Model saved in file: ./model/2Words2Vec-tf-rnn-nce.ckpt
Average loss at step  16000 :  4.84841774702
Model saved in file: ./model/2Words2Vec-tf-rnn-nce.ckpt
epouch done...
Average loss at step  18000 :  4.85298632336
Mod

In [43]:
with tf.Session(graph=graph) as session:
    #saver.restore(session, MODEL)
    embeddings_saver.restore(session, WORDS2VEC_MODEL)
    final_embeddings = embeddings.eval()

from sklearn.preprocessing import normalize
normalize(final_embeddings, norm='l2', axis=1, copy=False)

words_vec = {}
for i in range(final_embeddings.shape[0]):
    words_vec[inv_words_dict[i]] = final_embeddings[i]
  
words_vec2 = {}
with tf.Session(graph=graph) as session:
    saver.restore(session, MODEL)
    
    for key in words_vec.keys():
        feed_dict = {inputs: [[words_dict[key]]], input_lengths: [1]}
        words_vec2[key] = normalize(rnn_final_state.eval(feed_dict), norm='l2', copy=True)[0]
    

INFO:tensorflow:Restoring parameters from ./model/brown-Words2Vec.ckpt
INFO:tensorflow:Restoring parameters from ./model/2Words2Vec-tf-rnn-nce.ckpt


In [44]:
def sent2Context(sent):
    with tf.Session(graph=graph) as session:
        saver.restore(session, MODEL)
        feed_dict = {inputs: [[words_dict[word] for word in sent]], input_lengths: ([len(sent)])}
        context_pred = context.eval(feed_dict)
        print(rnn_final_state.eval(feed_dict))
        return [inv_words_dict[i] for i in context_pred.argsort()[0][::-1][:10]]
    
def twoWords2Vec(sent):
    with tf.Session(graph=graph) as session:
        saver.restore(session, MODEL)
        feed_dict = {inputs: [[words_dict[word] for word in sent]], input_lengths: ([len(sent)])}
        return normalize(rnn_final_state.eval(feed_dict), norm='l2', copy=True)[0]

In [45]:
print(twoWords2Vec(['run', 'faster']))

INFO:tensorflow:Restoring parameters from ./model/2Words2Vec-tf-rnn-nce.ckpt
[-0.05923763  0.2136212  -0.14718555  0.06965707  0.00332109  0.07057834
  0.05668808 -0.02834846  0.23397775  0.14585446  0.13615161 -0.13666223
 -0.10390599 -0.1941687  -0.04024365  0.02982403  0.19603655  0.27186435
 -0.01047328  0.19310986 -0.0632597  -0.16442876 -0.00136202 -0.18368034
 -0.17697984  0.06736191 -0.04950423  0.21467388 -0.08571327  0.07059266
  0.14630686  0.20384529  0.17110957  0.15202926 -0.17488046 -0.18977112
 -0.11714847  0.0124442   0.15827432 -0.19323823  0.24799895 -0.18233418
 -0.00799689 -0.09322709  0.0574378  -0.02100265  0.00937227  0.14544383
  0.21380867  0.08619152]


In [46]:
def cloestWord(vec, words_vec, count=10, method='cos'):
    if method == 'cos':
        dist = np.array([ sum(vec * words_vec[key]) for key in words_vec.keys()])
        top_ten = dist.argsort()[::-1][:10]
    else:
        dist = np.array([ sum(np.square(np.array(vec) - np.array(words_vec[key]))) for key in words_vec.keys()])
        top_ten = dist.argsort()[:10]
    return [list(words_vec.keys())[i] for i in top_ten]

In [49]:
#print(cloestWord(twoWords2Vec(['run', 'faster'])))
#print(cloestWord(twoWords2Vec(['two', 'idiot'])))
#print(cloestWord(words_vec['two']))
#print(cloestWord(words_vec['but']))
#print(cloestWord(words_vec['man']))


#print(cloestWord(words_vec2['two'], words_vec2))
#print(cloestWord(words_vec2['but'], words_vec2))
#print(cloestWord(words_vec2['man'], words_vec2))

print(cloestWord(words_vec2['two'], words_vec2))
print(cloestWord(words_vec2['however'], words_vec2))
print(cloestWord(words_vec2['man'], words_vec2))

#print(cloestWord(twoWords2Vec(['but']), words_vec2))
#print(cloestWord(twoWords2Vec(['man']), words_vec2))
#print(cloestWord(twoWords2Vec(['two'])))
"""
def rnn_out(sent):
    with tf.Session(graph=graph) as session:
        saver.restore(session, MODEL)
        feed_dict = {inputs: [[words_dict[word] for word in sent]], input_lengths: ([len(sent)])}
        print('rnn_inputs: ', rnn_inputs.eval(feed_dict))
        print('rnn_outputs: ', rnn_outputs.eval(feed_dict))
        print('rnn_final_state:', rnn_final_state.eval(feed_dict))

rnn_out(['two'])
rnn_out(['three'])
words_vec['two']
"""

['two', 'six', 'st.', 'four', 'three', 'five', 'last', 'other', 'du', 'rhode']
['however', 'moreover', 'redder', 'etc.', 'sinful', 'irritability', 'fantasies', 'obstinate', 'sohn', 'isles']
['man', 'desired', 'interesting', 'plain', 'identified', 'applying', 'mother', 'long', 'officially', 'wonderful']


"\ndef rnn_out(sent):\n    with tf.Session(graph=graph) as session:\n        saver.restore(session, MODEL)\n        feed_dict = {inputs: [[words_dict[word] for word in sent]], input_lengths: ([len(sent)])}\n        print('rnn_inputs: ', rnn_inputs.eval(feed_dict))\n        print('rnn_outputs: ', rnn_outputs.eval(feed_dict))\n        print('rnn_final_state:', rnn_final_state.eval(feed_dict))\n\nrnn_out(['two'])\nrnn_out(['three'])\nwords_vec['two']\n"

In [31]:
WORDS2VEC_MODEL = './model/brown-Words2Vec.ckpt'

graph2 = tf.Graph()

with graph2.as_default():
    embeddings2 = tf.Variable(
            tf.random_uniform([VOCABULAY_SIZE, DIMENSION], -1.0, 1.0), name='Words2Vec')
    embeddings_saver2 = tf.train.Saver({'Words2Vec': embeddings2})

In [32]:
with tf.Session(graph=graph2) as session2:
    embeddings_saver2.restore(session2, WORDS2VEC_MODEL)
    final_embeddings2 = embeddings2.eval()
    
final_embeddings2 = normalize(final_embeddings2, norm='l2', axis=1, copy=True)

words_vec3 = {}
for i in range(final_embeddings2.shape[0]):
    words_vec3[inv_words_dict[i]] = final_embeddings2[i]


INFO:tensorflow:Restoring parameters from ./model/brown-Words2Vec.ckpt


In [33]:
print(cloestWord(words_vec3['two'], words_vec3))
print(cloestWord(words_vec3['however'], words_vec3))
print(cloestWord(words_vec3['man'], words_vec3))

['two', 'three', 'several', 'four', 'other', 'five', 'six', 'ten', 'each', 'types']
['however', 'therefore', 'moreover', 'indeed', 'nevertheless', 'jr.', 'especially', 'etc.', 'finally', 'instance']
['man', 'woman', 'person', 'child', 'boy', 'killed', 'girl', 'married', 'shot', 'further']


In [34]:
print(words_dict['five'])
print(inv_words_dict[334])

334
five
