In [1]:
from Utils.FS import file
from Utils.tensorflow_helper import show_graph
from nltk.tokenize.punkt import PunktSentenceTokenizer
from nltk.tokenize import word_tokenize
from nltk.corpus import brown
from scipy.sparse import coo_matrix, dok_matrix
from scipy.sparse.linalg import svds
from sklearn.preprocessing import normalize
import matplotlib.pyplot as plt
import numpy as np
from sklearn.manifold import TSNE
from wordcloud import WordCloud
from nltk.corpus import stopwords
import tensorflow as tf
import math

In [2]:
np.random.seed(1234)

In [3]:
sents = brown.sents()
sents = [[token.lower() for token in sent] for sent in sents]
words = brown.words()
words = [word.lower() for word in words]

In [4]:
print("Number of tokens: {}".format(len(words)))
print("Number of sentences: {}".format(len(sents)))
print("Longest sentences length: {}".format(max([len(sent) for sent in sents])))

Number of tokens: 1161192
Number of sentences: 57340
Longest sentences length: 180


In [5]:
def words2dicts(words):
    f_dict = {}
    index = 0
    for word in words:
        if not word in f_dict.keys():
            f_dict[word] = index
            index = index + 1
        
    return f_dict, { v:k for k, v in f_dict.items()}

In [6]:
words_dict, inv_words_dict = words2dicts(words)
words_size = len(words_dict)
print("Number of unique tokens: {}".format(words_size))

Number of unique tokens: 49815


In [7]:
def batchPadding(batch, padding_symbol=words_dict['--']):
    size = max([len(record) for record in batch])
    result = np.full((len(batch), size), padding_symbol)
    for i in range(len(batch)):
        result[i][:len(batch[i])] = batch[i]
    return result

print(batchPadding([[1,2], [1], [1,2,3]]))

[[  1   2 475]
 [  1 475 475]
 [  1   2   3]]


In [8]:
def dataGenerator(sents, words_dict, window_size = 5, batch_size=32):
    s = []
    c = []
    l = []
    while(True):
        window = ['--' for i in range(window_size)]
        for sent in sents:
            sentence = [words_dict[word] for word in sent]
            for context in window:
                s.append(sentence)
                c.append([words_dict[context]])
                l.append(len(sentence))
                if(len(c) == batch_size):
                    yield batchPadding(s), c, l
                    s = []
                    c = []
                    l = []
            for word in sent:
                #s.append(sentence)
                #c.append([words_dict[word]])
                #if(len(c) == batch_size):
                #    yield s, c
                #    s = []
                #    c = []
                window.pop(0)
                window.append(word)

        window = ['--' for i in range(window_size)]
        for sent in reversed(sents):
            for context in window:
                s.append([words_dict[word] for word in sent])
                c.append([words_dict[context]])
                l.append(len(sentence))
                if(len(c) == batch_size):
                    yield batchPadding(s), c, l
                    s = []
                    c = []
                    l = []
            for word in reversed(sent):
                window.pop(0)
                window.append(word)

In [9]:
WINDOW_SIZE = 15
BATCH_SIZE = 32
generator = dataGenerator(sents, words_dict, window_size = WINDOW_SIZE, batch_size=BATCH_SIZE)

In [10]:
DIMENSION = 50
VOCABULAY_SIZE = len(words_dict)
NEGATIVE_SAMPLE = 1000

graph = tf.Graph()

with graph.as_default():
    
        inputs = tf.placeholder(tf.int32, (BATCH_SIZE, None), name = "Input_Sentence_Word_Index")
        #OUT: (batch, time) int32
        
        input_lengths = tf.placeholder(tf.int32, (BATCH_SIZE), name = "Input_Sentence_Length")
        #OUT: (batch) int32
        
        labels = tf.placeholder(tf.int32, (BATCH_SIZE, 1), name = "Context_Word_Index")
        #OUT: (batch, 1) int32
        
        embeddings = tf.Variable(tf.zeros([VOCABULAY_SIZE, DIMENSION], tf.float32), trainable=False, name="Word2Vec")
        
        #IN: (batch, time) int32
        inputs_embed = tf.nn.embedding_lookup(embeddings, inputs, max_norm=1)
        #OUT: (batch, time, dim) float32

        #IN: (batch, time, dim) float32
        rnn_inputs = tf.transpose(inputs_embed, [1, 0, 2])
        #OUT: (time, batch, dim) float32

        cell = tf.contrib.rnn.BasicLSTMCell(DIMENSION, state_is_tuple=True)        
        initial_state = cell.zero_state(BATCH_SIZE, tf.float32)
        
        #IN: (time, batch, DIMENSION) float32
        rnn_outputs, rnn_states = tf.nn.dynamic_rnn(cell, rnn_inputs, initial_state=initial_state, time_major=True, sequence_length=input_lengths)
        #OUT: (time, batch, DIMENSION) float32
        
        #IN: (time, batch, DIMENSION) float32
        index = tf.range(0, BATCH_SIZE) * tf.shape(rnn_outputs)[0] + (input_lengths - 1)
        rnn_final_state = tf.gather(tf.reshape(rnn_outputs, [-1, DIMENSION]), index)
        #OUT: (batch, DIMENSION)
        
        nce_weights = tf.Variable(
            tf.truncated_normal([VOCABULAY_SIZE, DIMENSION],
                                stddev=1.0 / math.sqrt(DIMENSION)), name="NCE_Weights")

        nce_biases = tf.Variable(tf.zeros([VOCABULAY_SIZE]), name="NCE_Bias")

        loss = tf.reduce_mean(
          tf.nn.nce_loss(weights=nce_weights,
                     biases=nce_biases,
                     labels=labels,
                     inputs=rnn_final_state,
                     num_sampled=NEGATIVE_SAMPLE,
                     num_classes=VOCABULAY_SIZE))

        optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)
        #optimizer = tf.train.AdamOptimizer(1.0).minimize(loss)
        
        init = tf.global_variables_initializer()
        saver = tf.train.Saver()
        embeddings_saver = tf.train.Saver({'Words2Vec': embeddings})
        context = tf.nn.softmax(tf.matmul(rnn_final_state, tf.transpose(nce_weights)) + nce_biases)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [None]:
show_graph(graph.as_graph_def())

In [None]:
num_steps = 100001
MODEL = './model/rnn-tf-nce-model.ckpt'
WORDS2VEC_MODEL = './model/brown-Words2Vec.ckpt'

with tf.Session(graph=graph) as session:
    init.run()
    embeddings_saver.restore(session, WORDS2VEC_MODEL)
    #saver.restore(session, MODEL)
      
    average_loss = 0
    for step in range(num_steps):
        batch_inputs, batch_labels, batch_input_lengths = next(generator)
        feed_dict = {inputs: batch_inputs, labels: batch_labels, input_lengths: batch_input_lengths}

        _, loss_val = session.run([optimizer, loss], feed_dict=feed_dict)
        average_loss += loss_val

        if step % 2000 == 0:
            if step > 0:
                average_loss /= 2000
                print('Average loss at step ', step, ': ', average_loss)
                average_loss = 0
                
        if step % 20000 == 0:
            save_path = saver.save(session, MODEL)
            print("Model saved in file: %s" % save_path)

INFO:tensorflow:Restoring parameters from ./model/brown-Words2Vec.ckpt
Model saved in file: ./model/rnn-tf-nce-model.ckpt
Average loss at step  2000 :  332.03566642
Average loss at step  4000 :  75.3500075116
Average loss at step  6000 :  33.8462652137
Average loss at step  8000 :  20.0237583435
Average loss at step  10000 :  15.0754849977


In [None]:
from sklearn.preprocessing import normalize
MODEL = './model/rnn-tf-nce-model.ckpt'

with tf.Session(graph=graph) as session:
    saver.restore(session, MODEL)
    print("Model restored.")
    
    sents_vec = {}
    for sent in sents:
        feed_dict = {inputs: [[words_dict[word] for word in sent]]}
        sents_vec[' '.join(sent)] = np.array(sum(normalize(rnn_final_state.eval(feed_dict), norm='l2', copy=False)))

In [None]:
def sent2Vec(sent):
    with tf.Session(graph=graph) as session:
        saver.restore(session, MODEL)
        feed_dict = {inputs: [[words_dict[word] for word in sent]]}
        return np.array(sum(normalize(rnn_final_state.eval(feed_dict), norm='l2', copy=False)))

In [None]:
def sent2Context(sent):
    with tf.Session(graph=graph) as session:
        saver.restore(session, MODEL)
        feed_dict = {inputs: [[words_dict[word] for word in sent]]}
        context_pred = context.eval(feed_dict)
        return [inv_words_dict[i] for i in context_pred.argsort()[0][::-1][:10]]

In [None]:
print(sent2Context(['one', 'thing', 'i', 'don\'t', 'know', 'why']))