In [1]:
from Utils.FS import file
from Utils.tensorflow_helper import show_graph
from nltk.tokenize.punkt import PunktSentenceTokenizer
from nltk.tokenize import word_tokenize
from nltk.corpus import brown
from scipy.sparse import coo_matrix, dok_matrix
from scipy.sparse.linalg import svds
from sklearn.preprocessing import normalize
import matplotlib.pyplot as plt
import numpy as np
from sklearn.manifold import TSNE
from wordcloud import WordCloud
from nltk.corpus import stopwords
import tensorflow as tf
import math

In [2]:
# Fix NumPy's global RNG so any NumPy-based randomness below is repeatable.
np.random.seed(seed=1234)

In [3]:
# Lower-cased copies of the Brown corpus: `sents` is a list of token lists,
# `words` is the flat token list.
sents = [[token.lower() for token in sent] for sent in brown.sents()]
words = [word.lower() for word in brown.words()]

In [4]:
# Quick size summary of the lower-cased corpus.
print("Number of tokens: {}".format(len(words)))
print("Number of sentences: {}".format(len(sents)))
print("Longest sentences length: {}".format(max(len(sent) for sent in sents)))

Number of tokens: 1161192
Number of sentences: 57340
Longest sentences length: 180


In [5]:
def words2dicts(words):
    """Assign each distinct word an integer id in first-seen order.

    Args:
        words: iterable of hashable tokens.

    Returns:
        A pair ``(word_to_index, index_to_word)`` of mutually inverse dicts.
    """
    word_to_index = {}
    for token in words:
        if token not in word_to_index:
            # The next free id is simply the number of words seen so far.
            word_to_index[token] = len(word_to_index)

    index_to_word = dict((idx, token) for token, idx in word_to_index.items())
    return word_to_index, index_to_word

In [7]:
def sents2dicts(sents):
    """Assign each distinct sentence an integer id in first-seen order.

    A sentence is keyed by its tokens joined with ``'_'``.

    Args:
        sents: iterable of token lists (each token a string).

    Returns:
        A pair ``(sentence_to_index, index_to_sentence)`` of mutually
        inverse dicts keyed on the joined sentence string.
    """
    sentence_to_index = {}
    for tokens in sents:
        key = '_'.join(tokens)
        # setdefault leaves an existing id alone; a new key gets the next id.
        sentence_to_index.setdefault(key, len(sentence_to_index))

    index_to_sentence = dict((idx, key) for key, idx in sentence_to_index.items())
    return sentence_to_index, index_to_sentence

In [9]:
# Build the word <-> id and sentence <-> id lookup tables for the corpus.
words_dict, inv_words_dict = words2dicts(words)
sents_dict, inv_sents_dict = sents2dicts(sents)

words_size, sents_size = len(words_dict), len(sents_dict)
print("Number of unique words: {}".format(words_size))
print("Number of unique sentences: {}".format(sents_size))

Number of unique words: 49815
Number of unique sentences: 56410


In [10]:
def dataGenerator(sents, sents_dict, window_size = 5, batch_size=32):
    """Endlessly yield batches of (sentence, preceding-context-sentence) pairs.

    The corpus is walked forwards and then backwards, over and over. For every
    sentence, one pair is emitted per entry of a sliding window holding the
    ``window_size`` previously visited sentences; at the start of each pass the
    window is filled with the one-token padding sentence ``['--']``.

    Args:
        sents: sequence of sentences, each a list of tokens. Must support
            ``reversed``.
        sents_dict: mapping from token to integer id used to encode both the
            sentence and its context. (The name is historical; the notebook
            passes its word-index dict here.) Must contain the padding
            token ``'--'``.
        window_size: number of previously visited sentences used as context.
        batch_size: number of pairs per yielded batch.

    Yields:
        ``(s, c)``: two lists of length ``batch_size``. ``s[i]`` is the encoded
        sentence and ``c[i]`` the encoded context sentence of the i-th pair.
        A partially filled batch is carried over between passes.

    Raises:
        KeyError: if a token (or ``'--'``) is missing from ``sents_dict``.
    """
    def _encode(tokens):
        # Use the mapping handed in by the caller rather than a notebook global.
        return [sents_dict[token] for token in tokens]

    batch_sents = []
    batch_contexts = []
    while True:
        for backwards in (False, True):
            ordered_sents = reversed(sents) if backwards else sents
            # Each pass starts with a window made only of padding sentences.
            window = [['--'] for _ in range(window_size)]
            for sent in ordered_sents:
                sentence = _encode(sent)
                for context_sent in window:
                    batch_sents.append(sentence)
                    batch_contexts.append(_encode(context_sent))
                    if len(batch_contexts) == batch_size:
                        yield batch_sents, batch_contexts
                        batch_sents = []
                        batch_contexts = []
                # Slide the window: drop the oldest sentence, add the current one.
                window.pop(0)
                window.append(sent)


In [11]:
# Context window (in sentences) and pairs per batch for the training generator.
# The word-index dict is passed as the generator's token mapping.
WINDOW_SIZE, BATCH_SIZE = 2, 1
generator = dataGenerator(sents, words_dict, WINDOW_SIZE, BATCH_SIZE)

In [39]:
# Model hyper-parameters: embedding / LSTM width, vocabulary size and the
# number of negative classes sampled per NCE step.
DIMENSION = 50
VOCABULAY_SIZE = len(words_dict)
NEGATIVE_SAMPLE = 64

graph = tf.Graph()

# Build the whole model inside its own graph: an LSTM reads the embedded words
# of one sentence and its last output is trained with NCE against `labels`.
with graph.as_default():
    
        inputs = tf.placeholder(tf.int32, (None, None), name = "Input_SentenceWord_Index")
        #OUT: (batch: 1, time) int32
        
        # Only row 0 of the fed batch is used from here on.
        #IN: (batch: 1, time) int32
        first_inputs = tf.gather(inputs, 0)
        #OUT: (time) int32
        
        labels = tf.placeholder(tf.int32, shape=[BATCH_SIZE, 1], name = "Context_Word_Index")
        #OUT: (batch: 1, 1) int32
        
        # NOTE(review): first_labels is only referenced by the commented-out
        # labels_embed line below; no live op in this cell consumes it.
        #IN: (batch: 1, 1) int32
        first_labels = tf.gather(labels, 0)
        #OUT: (1) int32
        
        
        # Dynamic batch size and sentence length, read from the fed tensor.
        batch_size = tf.shape(inputs, name="Batch_Size")[0]
        input_length = tf.shape(inputs, name="Sentence_Length")[1]
        
        # Frozen (trainable=False) embedding table, zero-initialised here.
        # Presumably meant to be filled from a checkpoint via embeddings_saver
        # below before use -- TODO confirm.
        embeddings = tf.Variable(tf.zeros([VOCABULAY_SIZE, DIMENSION], tf.float32), trainable=False, name="Word2Vec")
        
        
        # max_norm=1 clips each looked-up embedding row to an L2 norm of at most 1.
        #IN: (time) int32
        inputs_embed = tf.nn.embedding_lookup(embeddings, first_inputs, max_norm=1)
        #OUT: (time, dim) float32

        #IN: (batch: 1, 1) int32
        #labels_embed = tf.nn.embedding_lookup(embeddings, first_labels, max_norm=1)
        #OUT: (1, dim) float32

        # Add a batch axis of size 1, then reorder to time-major for dynamic_rnn.
        #IN: (time, dim) float32
        rnn_inputs = tf.transpose(tf.expand_dims(inputs_embed, 0), [1, 0, 2])
        #OUT: (time, batch: 1, dim) float32

        # NOTE(review): the zero state uses the fed batch size while rnn_inputs
        # always has a batch axis of 1, so the two only agree when exactly one
        # sentence is fed -- confirm this is intended.
        cell = tf.contrib.rnn.BasicLSTMCell(DIMENSION, state_is_tuple=True)        
        initial_state = cell.zero_state(batch_size, tf.float32)
        
        #IN: (time, batch, DIMENSION) float32
        rnn_outputs, rnn_states = tf.nn.dynamic_rnn(cell, rnn_inputs, initial_state=initial_state, time_major=True)    
        #OUT: (time, batch, DIMENSION) float32
        
        # Despite its name this is taken from rnn_outputs (not rnn_states):
        # the LSTM output at the last time step, index input_length - 1.
        #IN: (time, batch, DIMENSION) float32
        rnn_final_state = tf.gather(rnn_outputs, tf.subtract(input_length, 1))
        #OUT: (batch, DIMENSION)
        
        # Output-layer parameters used by the NCE loss and by the full softmax below.
        nce_weights = tf.Variable(
            tf.truncated_normal([VOCABULAY_SIZE, DIMENSION],
                                stddev=1.0 / math.sqrt(DIMENSION)), name="NCE_Weights")

        nce_biases = tf.Variable(tf.zeros([VOCABULAY_SIZE]), name="NCE_Bias")

        # Mean NCE loss of predicting `labels` from the sentence representation,
        # with NEGATIVE_SAMPLE sampled negative classes.
        loss = tf.reduce_mean(
          tf.nn.nce_loss(weights=nce_weights,
                     biases=nce_biases,
                     labels=labels,
                     inputs=rnn_final_state,
                     num_sampled=NEGATIVE_SAMPLE,
                     num_classes=VOCABULAY_SIZE))

        # Plain SGD with learning rate 1.0.
        optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)
        #optimizer = tf.train.AdamOptimizer(1.0).minimize(loss)
        
        init = tf.global_variables_initializer()
        # `saver` is built with default arguments; `embeddings_saver` handles only
        # the embedding table, stored under the checkpoint key 'Words2Vec' (note
        # the variable itself is named "Word2Vec").
        saver = tf.train.Saver()
        embeddings_saver = tf.train.Saver({'Words2Vec': embeddings})
        # Full softmax over the vocabulary, reusing the NCE weights and biases.
        context = tf.nn.softmax(tf.matmul(rnn_final_state, tf.transpose(nce_weights)) + nce_biases)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [10]:
# Hand the serialized graph definition to the project helper imported from
# Utils.tensorflow_helper (presumably an inline graph visualisation -- confirm).
show_graph(graph.as_graph_def())

In [28]:
# Training configuration.
num_steps = 100001
MODEL_DIR = './model'
MODEL = MODEL_DIR + '/rnn-tf-nce-model.ckpt'
WORDS2VEC_MODEL = MODEL_DIR + '/brown-Words2Vec.ckpt'
REPORT_EVERY = 2000   # steps between average-loss reports
SAVE_EVERY = 20000    # steps between checkpoints

# Saving needs the checkpoint directory to exist; MakeDirs is a no-op
# when it is already there.
tf.gfile.MakeDirs(MODEL_DIR)

# NOTE(review): `labels` is declared as [BATCH_SIZE, 1] while dataGenerator
# yields whole context sentences (lists of word ids), so feeds only fit when the
# context has exactly one token -- confirm which generator produced the logged run.
with tf.Session(graph=graph) as session:
    if tf.train.checkpoint_exists(MODEL):
        # Resume from the previous run.
        saver.restore(session, MODEL)
    else:
        # Fresh start: initialise every variable, then load the embedding table
        # when its checkpoint is available (it is zero-initialised and frozen
        # in the graph, so it is never learned here).
        init.run()
        if tf.train.checkpoint_exists(WORDS2VEC_MODEL):
            embeddings_saver.restore(session, WORDS2VEC_MODEL)
        else:
            print("Warning: no checkpoint at %s; embeddings stay at their initial value"
                  % WORDS2VEC_MODEL)

    average_loss = 0
    losses_seen = 0
    for step in range(num_steps):
        # `generator` is stateful: re-running this cell continues where it stopped.
        batch_inputs, batch_labels = next(generator)
        feed_dict = {inputs: batch_inputs, labels: batch_labels}

        _, loss_val = session.run([optimizer, loss], feed_dict=feed_dict)
        average_loss += loss_val
        losses_seen += 1

        if step > 0 and step % REPORT_EVERY == 0:
            # Divide by the number of losses actually accumulated; the first
            # window also contains step 0 and is therefore one step longer.
            average_loss /= losses_seen
            print('Average loss at step ', step, ': ', average_loss)
            average_loss = 0
            losses_seen = 0

        if step % SAVE_EVERY == 0:
            save_path = saver.save(session, MODEL)
            print("Model saved in file: %s" % save_path)

INFO:tensorflow:Restoring parameters from ./model/rnn-tf-nce-model.ckpt
Model saved in file: ./model/rnn-tf-nce-model.ckpt
Average loss at step  2000 :  4.42873116758
Average loss at step  4000 :  4.99640004569
Average loss at step  6000 :  5.04272883257
Average loss at step  8000 :  4.89267559397
Average loss at step  10000 :  4.91183442573
Average loss at step  12000 :  4.91512426944
Average loss at step  14000 :  4.35134663039
Average loss at step  16000 :  4.85399229996
Average loss at step  18000 :  4.55533740999
Average loss at step  20000 :  4.5741002069
Model saved in file: ./model/rnn-tf-nce-model.ckpt
Average loss at step  22000 :  4.7582748822
Average loss at step  24000 :  4.62516883402
Average loss at step  26000 :  4.64127873324
Average loss at step  28000 :  4.58614637515
Average loss at step  30000 :  4.59192995227
Average loss at step  32000 :  4.53417497756
Average loss at step  34000 :  4.60397897445
Average loss at step  36000 :  4.58622804238
Average loss at step  

In [29]:
# Checkpoint to read the trained model from (redefined here so this cell does
# not depend on the training cell having been run).
MODEL = './model/rnn-tf-nce-model.ckpt'

# Encode every corpus sentence into a fixed-size vector: feed the sentence,
# take the LSTM's last output, L2-normalise each row and sum the rows.
# `normalize` comes from the notebook's top import cell.
with tf.Session(graph=graph) as session:
    saver.restore(session, MODEL)
    print("Model restored.")

    sents_vec = {}
    for sent in sents:
        key = ' '.join(sent)
        if key in sents_vec:
            # Repeated sentence: a second evaluation would only overwrite the
            # same key (assumes tokens contain no spaces, so equal keys mean
            # equal token lists).
            continue
        feed_dict = {inputs: [[words_dict[word] for word in sent]]}
        sents_vec[key] = np.array(sum(normalize(rnn_final_state.eval(feed_dict), norm='l2', copy=False)))

INFO:tensorflow:Restoring parameters from ./model/rnn-tf-nce-model.ckpt
Model restored.


In [30]:
def sent2Vec(sent):
    """Encode one tokenised sentence with the trained model.

    Opens a new session on ``graph`` and restores the checkpoint at ``MODEL``
    on every call.

    Args:
        sent: list of tokens; each must be a key of ``words_dict``.

    Returns:
        NumPy array: the row-wise L2-normalised ``rnn_final_state`` output,
        summed over rows.
    """
    with tf.Session(graph=graph) as session:
        saver.restore(session, MODEL)
        token_ids = [words_dict[word] for word in sent]
        final_output = rnn_final_state.eval({inputs: [token_ids]})
        unit_rows = normalize(final_output, norm='l2', copy=False)
        return np.array(sum(unit_rows))

In [107]:
def sent2Context(sent, top_k=10):
    """Return the words the model scores highest as context for a sentence.

    Opens a new session on ``graph`` and restores the checkpoint at ``MODEL``
    on every call.

    Args:
        sent: list of tokens; each must be a key of ``words_dict``.
        top_k: how many words to return (default 10, the previous fixed value).

    Returns:
        List of up to ``top_k`` words, ordered from highest to lowest softmax
        probability in ``context``.
    """
    with tf.Session(graph=graph) as session:
        saver.restore(session, MODEL)
        feed_dict = {inputs: [[words_dict[word] for word in sent]]}
        context_pred = context.eval(feed_dict)
        # Row 0 holds the distribution for the single fed sentence; argsort is
        # ascending, so reverse it before taking the leading entries.
        ranked_ids = context_pred[0].argsort()[::-1][:top_k]
        return [inv_words_dict[i] for i in ranked_ids]

In [108]:
# Example query: print the model's top-ranked context words for one sentence.
print(sent2Context(['one', 'thing', 'i', 'don\'t', 'know', 'why']))

INFO:tensorflow:Restoring parameters from ./model/rnn-tf-nce-model.ckpt
['to', 'at', 'be', 'big', 'party', 'but', 'little', 'he', 'into', 'still']
