In [1]:
from Utils.FS import file
from nltk.tokenize.punkt import PunktSentenceTokenizer
from nltk.tokenize import word_tokenize
from nltk.corpus import brown
from scipy.sparse import coo_matrix, dok_matrix
from scipy.sparse.linalg import svds
from sklearn.preprocessing import normalize
import matplotlib.pyplot as plt
import numpy as np
from sklearn.manifold import TSNE
from wordcloud import WordCloud
from nltk.corpus import stopwords
import tensorflow as tf
import math

In [5]:
# Seed NumPy's global RNG so NumPy-based sampling in this notebook is repeatable.
# NOTE(review): no TensorFlow seed is set in the visible cells, so the graph's
# random initialisers are presumably not covered by this seed — confirm.
np.random.seed(1234)

In [6]:
# Lower-cased Brown corpus, once as a list of token lists (sentences) and once
# as a single flat token list.
sents = [[token.lower() for token in sentence] for sentence in brown.sents()]
words = [token.lower() for token in brown.words()]

In [7]:
# Quick sanity summary of the loaded corpus.
for template, value in (
    ("Number of tokens: {}", len(words)),
    ("Number of sentences: {}", len(sents)),
    ("Longest sentences length: {}", max(map(len, sents))),
):
    print(template.format(value))

Number of tokens: 1161192
Number of sentences: 57340
Longest sentences length: 180


In [8]:
def words2dicts(words):
    """Build a vocabulary index and its inverse from a token sequence.

    Each distinct token receives the next free integer id, in order of first
    appearance (the first token gets 0).

    Args:
        words: iterable of hashable tokens.

    Returns:
        A tuple ``(token_to_id, id_to_token)`` of two dicts that are inverses
        of each other.
    """
    token_to_id = {}
    for token in words:
        # setdefault only inserts unseen tokens; the current size is the next id.
        token_to_id.setdefault(token, len(token_to_id))

    id_to_token = {}
    for token, token_id in token_to_id.items():
        id_to_token[token_id] = token
    return token_to_id, id_to_token

In [9]:
# Build the token -> id map and its inverse from the flat, lower-cased token list.
words_dict, inv_words_dict = words2dicts(words)
# Vocabulary size, i.e. the number of distinct tokens.
words_size = len(words_dict)
print("Number of unique tokens: {}".format(words_size))

Number of unique tokens: 49815


In [10]:
def dataGenerator(sents, words_dict, window_size = 5, batch_size=32, pad_token='--'):
    """Endlessly yield (sentence, context-word) training batches.

    The corpus is traversed forward, then backward, then forward again, and so
    on. During a pass a sliding window holds the last ``window_size`` tokens
    seen *before* the current sentence in traversal order (it is carried across
    sentence boundaries and reset to padding at the start of every pass). Each
    sentence is paired once with every token currently in that window.

    Args:
        sents: sequence of sentences, each a sequence of tokens. Must support
            ``len()`` and ``reversed()``.
        words_dict: mapping token -> integer id. Must contain ``pad_token`` and
            every token in ``sents`` (otherwise ``KeyError`` is raised).
        window_size: number of context tokens kept in the window (>= 1).
        batch_size: number of pairs per yielded batch (>= 1).
        pad_token: token used to fill the window at the start of each pass.

    Yields:
        ``(s, c)`` where ``s`` is a list of ``batch_size`` sentences, each a
        list of token ids (un-padded, so lengths may differ), and ``c`` is a
        list of ``batch_size`` one-element lists ``[context_id]``. A partially
        filled batch is carried over into the next pass.

    Raises:
        ValueError: on first iteration, if ``sents`` is empty or
            ``window_size`` / ``batch_size`` is smaller than 1. These inputs
            would otherwise loop forever without yielding, or fail inside the
            window update.
    """
    if window_size < 1:
        raise ValueError("window_size must be >= 1, got {}".format(window_size))
    if batch_size < 1:
        raise ValueError("batch_size must be >= 1, got {}".format(batch_size))
    if len(sents) == 0:
        raise ValueError("sents must contain at least one sentence")

    def _pairs():
        # Infinite stream of (sentence_ids, [context_id]) pairs, alternating
        # between a forward and a backward pass over the corpus.
        while True:
            for backwards in (False, True):
                window = [pad_token] * window_size
                for sent in (reversed(sents) if backwards else sents):
                    sent_ids = [words_dict[word] for word in sent]
                    for context in window:
                        # Fresh list per pair so batch entries never alias.
                        yield list(sent_ids), [words_dict[context]]
                    # Slide the window over this sentence, in traversal order.
                    for word in (reversed(sent) if backwards else sent):
                        window.pop(0)
                        window.append(word)

    s, c = [], []
    for sent_ids, context_id in _pairs():
        s.append(sent_ids)
        c.append(context_id)
        if len(c) == batch_size:
            yield s, c
            s, c = [], []

In [11]:
# Number of context tokens kept in the generator's sliding window
# (overrides the generator's default of 5).
WINDOW_SIZE = 10
# Pairs per training batch; also fixes the first dimension of the `labels` placeholder.
# NOTE(review): the generator emits un-padded, variable-length id lists, so a
# batch larger than 1 would presumably be ragged when fed to the graph — confirm
# before raising this.
BATCH_SIZE = 1
# Single shared generator instance; it is stateful and never ends.
generator = dataGenerator(sents, words_dict, window_size = WINDOW_SIZE, batch_size=BATCH_SIZE)

In [12]:
# Model hyper-parameters.
DIMENSION = 50                      # LSTM hidden size, also the width of the NCE weight rows
VOCABULAY_SIZE = len(words_dict)    # number of output classes for the NCE loss
NEGATIVE_SAMPLE = 64                # negative classes sampled per NCE batch

# Build everything in a dedicated graph so later cells can open sessions on it.
# NOTE(review): the two tf.Variable objects below get auto-generated names;
# presumably existing checkpoints depend on their creation order — verify before
# reordering anything in this cell.
graph = tf.Graph()

with graph.as_default():
    with tf.device('/cpu:0'):
    
        # Token ids of each sentence, fed as floats; batch and time sizes are dynamic.
        inputs = tf.placeholder(tf.float32, (None, None)) #(batch, time)
        batch_size = tf.shape(inputs)[0]
        input_length = tf.shape(inputs)[1]

        # One target word id per batch row; the batch dimension is fixed to BATCH_SIZE.
        labels = tf.placeholder(tf.int32, shape=[BATCH_SIZE, 1])

        # Make the input time-major and add a trailing feature axis of size 1, so
        # each LSTM step receives a single scalar (the raw token id as a float).
        # NOTE(review): ids are used as scalar magnitudes, not via an embedding
        # lookup — confirm this is intended.
        rnn_inputs  = tf.expand_dims(tf.transpose(inputs, [1, 0]), -1)  # (time, batch, in)
        cell = tf.contrib.rnn.BasicLSTMCell(DIMENSION, state_is_tuple=True)
        initial_state = cell.zero_state(batch_size, tf.float32)
        rnn_outputs, rnn_states = tf.nn.dynamic_rnn(cell, rnn_inputs, initial_state=initial_state, time_major=True)    
        
        # Sentence encoding: the LSTM *output* at the last time step (index
        # input_length - 1 of the time-major outputs), reshaped to rows of
        # DIMENSION values. Despite the name it is taken from rnn_outputs, not
        # from rnn_states.
        rnn_final_state = tf.reshape(tf.gather_nd(rnn_outputs, [[tf.subtract(input_length, 1)]]), [-1, DIMENSION])

        # Output-layer parameters for noise-contrastive estimation: one
        # DIMENSION-wide weight row and one bias per vocabulary entry.
        nce_weights = tf.Variable(
            tf.truncated_normal([VOCABULAY_SIZE, DIMENSION],
                                stddev=1.0 / math.sqrt(DIMENSION)))

        nce_biases = tf.Variable(tf.zeros([VOCABULAY_SIZE]))

        # Mean NCE loss of predicting `labels` from the sentence encoding.
        loss = tf.reduce_mean(
          tf.nn.nce_loss(weights=nce_weights,
                     biases=nce_biases,
                     labels=labels,
                     inputs=rnn_final_state,
                     num_sampled=NEGATIVE_SAMPLE,
                     num_classes=VOCABULAY_SIZE))

        # Plain gradient descent with learning rate 1.0.
        optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)
        
        # Ops used by later cells to initialise, save and restore the variables.
        init = tf.global_variables_initializer()
        saver = tf.train.Saver()

In [52]:
num_steps = 100001
MODEL = './model/rnn-tf-nce-model.ckpt'
REPORT_EVERY = 2000        # steps between average-loss reports
CHECKPOINT_EVERY = 20000   # steps between checkpoint saves

with tf.Session(graph=graph) as session:
    # Resume training when a checkpoint is present; otherwise start from freshly
    # initialised variables so this cell also works on a first run.
    if tf.train.checkpoint_exists(MODEL):
        saver.restore(session, MODEL)
    else:
        session.run(init)

    average_loss = 0   # running sum of losses since the last report
    loss_count = 0     # number of losses accumulated in average_loss
    for step in range(num_steps):
        # The generator is shared across runs of this cell, so training continues
        # from wherever the previous run stopped in the corpus.
        batch_inputs, batch_labels = next(generator)
        feed_dict = {inputs: batch_inputs, labels: batch_labels}

        _, loss_val = session.run([optimizer, loss], feed_dict=feed_dict)
        average_loss += loss_val
        loss_count += 1

        if step > 0 and step % REPORT_EVERY == 0:
            # Divide by the number of losses actually summed: the first report
            # covers steps 0..REPORT_EVERY inclusive, one more than later ones.
            average_loss /= loss_count
            print('Average loss at step ', step, ': ', average_loss)
            average_loss = 0
            loss_count = 0

        # NOTE(review): saving presumably requires the ./model directory to
        # already exist — confirm.
        if step % CHECKPOINT_EVERY == 0:
            save_path = saver.save(session, MODEL)
            print("Model saved in file: %s" % save_path)

INFO:tensorflow:Restoring parameters from ./model/rnn-tf-nce-model.ckpt
Model saved in file: ./model/rnn-tf-nce-model.ckpt
Average loss at step  2000 :  11.2076740854
Average loss at step  4000 :  10.1554126407
Average loss at step  6000 :  10.4242633301
Average loss at step  8000 :  9.52975619698
Average loss at step  10000 :  9.62595331028
Average loss at step  12000 :  9.48849992478
Average loss at step  14000 :  9.43170004022
Average loss at step  16000 :  9.41286372928
Average loss at step  18000 :  9.42166561843
Average loss at step  20000 :  9.33900203326
Model saved in file: ./model/rnn-tf-nce-model.ckpt
Average loss at step  22000 :  9.33691114551
Average loss at step  24000 :  9.26824543309
Average loss at step  26000 :  9.58674203391
Average loss at step  28000 :  9.26968712316
Average loss at step  30000 :  9.28938150094
Average loss at step  32000 :  9.54470630907
Average loss at step  34000 :  9.33068675509
Average loss at step  36000 :  9.7230190336
Average loss at step 

In [53]:
from sklearn.preprocessing import normalize
MODEL = './model/rnn-tf-nce-model.ckpt'

with tf.Session(graph=graph) as session:
    saver.restore(session, MODEL)
    print("Model restored.")

    # Encode every corpus sentence with the restored model. The result maps the
    # space-joined sentence text to its L2-normalised encoding, so identical
    # sentences share a single entry.
    sents_vec = {}
    for sent in sents:
        token_ids = [words_dict[word] for word in sent]
        feed_dict = {inputs: [token_ids]}   # a batch holding this one sentence
        unit_rows = normalize(rnn_final_state.eval(feed_dict), norm='l2', copy=False)
        # Summing over the single row collapses the (1, DIMENSION) result to 1-D.
        sents_vec[' '.join(sent)] = np.array(sum(unit_rows))

INFO:tensorflow:Restoring parameters from ./model/rnn-tf-nce-model.ckpt
Model restored.


In [54]:
def sent2Vec(sent):
    """Encode one tokenised sentence with the trained model.

    A new session is opened on `graph` and the checkpoint at `MODEL` is
    restored on every call.

    Args:
        sent: sequence of tokens; every token must be a key of `words_dict`
            (a missing token raises KeyError).

    Returns:
        1-D NumPy array: the L2-normalised `rnn_final_state` for the sentence.
    """
    with tf.Session(graph=graph) as session:
        saver.restore(session, MODEL)
        token_ids = [words_dict[word] for word in sent]
        # Feed the sentence as a batch of one.
        encoded = rnn_final_state.eval({inputs: [token_ids]})
        unit_rows = normalize(encoded, norm='l2', copy=False)
        # Summing over the single row collapses the result to 1-D.
        return np.array(sum(unit_rows))

In [55]:
# Encodings of the first three corpus sentences.
for sample in sents[:3]:
    print(sent2Vec(sample))

# Element-wise product of two encodings (not summed, so this is not a dot product).
print(sent2Vec(sents[0]) * sent2Vec(sents[2]))

INFO:tensorflow:Restoring parameters from ./model/rnn-tf-nce-model.ckpt
[ -3.32622159e-23  -7.86810331e-29  -5.04758589e-07  -6.63168081e-20
  -6.83803166e-31   1.15277143e-09  -4.96376718e-10  -1.99860221e-27
   0.00000000e+00  -2.53019333e-01  -2.38509041e-15   0.00000000e+00
   2.55487044e-03   3.32223296e-01   3.72141510e-17   7.76481356e-20
  -2.53019392e-01   2.53019422e-01   2.16926068e-07   0.00000000e+00
  -8.16419737e-18  -3.41701992e-12   3.32223296e-01   0.00000000e+00
   4.67841193e-04  -2.53019392e-01   3.84168131e-12   6.63544582e-25
   9.72496578e-34  -3.32223296e-01   1.03643288e-06  -1.07066813e-23
   7.86664238e-34  -2.53019273e-01   0.00000000e+00  -1.60554919e-04
  -4.98425860e-23   8.64696486e-37   0.00000000e+00   1.92170653e-16
  -1.48656425e-25  -2.49830748e-07   5.72228873e-06   2.53019333e-01
   2.48085233e-11  -1.84134900e-14  -2.53019333e-01   2.68670770e-19
  -3.32223296e-01  -3.32223296e-01]
INFO:tensorflow:Restoring parameters from ./model/rnn-tf-nce-mod

In [None]:
def plotData(vocabs, X, Y):
    """Draw a labelled 2-D scatter plot and show it.

    Clears the current figure, opens a new 36x36 figure, plots the points,
    fits the axes exactly to the data extent and writes each label at its point.

    Args:
        vocabs: labels, one per point.
        X: x coordinates.
        Y: y coordinates.
    """
    plt.clf()
    plt.figure(figsize=(36, 36))
    plt.scatter(X, Y)

    # Clamp the axes to the bounding box of the data.
    x_low, x_high = min(X), max(X)
    y_low, y_high = min(Y), max(Y)
    plt.axis([x_low, x_high, y_low, y_high])

    for text, px, py in zip(vocabs, X, Y):
        plt.annotate(text, xy=(px, py), xytext=(0, 0), textcoords='offset points')
    plt.show()

In [None]:
def plot(vocabs, words_vec):
    """Plot the given words using the first two components of their vectors.

    Args:
        vocabs: words to plot; each must be a key of `words_vec`.
        words_vec: mapping word -> indexable vector with at least two entries.
    """
    def component(axis):
        # One coordinate per word, taken from position `axis` of its vector.
        return [words_vec[vocab][axis] for vocab in vocabs]

    plotData(vocabs, component(0), component(1))

In [None]:
def plotTSNE(vocabs, vectors):
    """Project the vectors of the given words to 2-D with t-SNE and plot them.

    Args:
        vocabs: words to plot; each must be a key of `vectors`.
        vectors: mapping word -> vector.
    """
    points = np.array([vectors[vocab] for vocab in vocabs])
    # Fixed random_state keeps the projection repeatable between runs.
    reducer = TSNE(perplexity=30, n_components=2, n_iter=5000, random_state = 7890, method='exact')
    projected = reducer.fit_transform(points)

    plotData(vocabs, projected[:, 0], projected[:, 1])

In [None]:
# Project-local loader for pre-trained GloVe vectors, used as a reference embedding.
from DataLoader import GloVe

# Later cells call glove.keys() and index it by word, so this is presumably a
# dict-like word -> vector mapping — confirm against DataLoader.
glove = GloVe.load2('./data/GloVe/glove.6B.50d.txt')

In [None]:
vocabs = ['man', 'woman', 'king', 'queen', 'male', 'female', 'boy', 'girl']
np.random.seed(1234)

# Draw 2000 vocabulary ids (with replacement, so repeats are possible) and keep
# the words that also have a GloVe vector.
sampled_ids = np.random.randint(0, len(words_dict), 2000)
random_vocabs = [
    inv_words_dict[i]
    for i in sampled_ids
    if inv_words_dict[i] in glove.keys()
]

print(len(random_vocabs))

In [None]:
#plotTSNE(random_vocabs, words_vec)

In [None]:
#plotTSNE(random_vocabs, glove)

In [None]:
def cloestWord(word, words_vec, count = 10, method=None):
    """Return the `count` keys of `words_vec` whose vectors are nearest to `word`'s.

    Args:
        word: query key; must be present in `words_vec` (KeyError otherwise).
        words_vec: mapping key -> vector (NumPy array or plain sequence of
            numbers), all of the same length.
        count: maximum number of keys to return.
        method: ``'cos'`` ranks by descending dot product with the query
            vector, which equals cosine similarity only when the vectors are
            unit-length. Any other value ranks by ascending squared Euclidean
            distance.

    Returns:
        List of at most `count` keys, best match first. The query word itself
        is not excluded, so it normally comes first.
    """
    # Snapshot the keys once so score positions and returned keys stay aligned.
    keys = list(words_vec.keys())
    query = np.asarray(words_vec[word])

    if method == 'cos':
        scores = np.array([sum(query * np.asarray(words_vec[key])) for key in keys])
        nearest = scores.argsort()[::-1][:count]
    else:
        scores = np.array([sum(np.square(query - np.asarray(words_vec[key]))) for key in keys])
        nearest = scores.argsort()[:count]
    return [keys[i] for i in nearest]

In [45]:
def cloestSent(sent, sents_vec, count=10):
    """Return the `count` stored sentences most similar to `sent`.

    The query is encoded with `sent2Vec` and compared to every stored vector by
    dot product; the highest scores come first.

    Args:
        sent: tokenised query sentence.
        sents_vec: mapping sentence text -> encoding vector.
        count: maximum number of sentences to return.

    Returns:
        List of keys of `sents_vec`, best match first.
    """
    query_vec = sent2Vec(sent)
    texts = list(sents_vec.keys())
    scores = np.array([sum(query_vec * sents_vec[text]) for text in texts])
    ranked = scores.argsort()[::-1][:count]

    return [texts[i] for i in ranked]

In [46]:
# Print the first corpus sentence, then its nearest stored sentences
# separated by blank lines.
print(' '.join(sents[0]) + '\n')
print('\n\n'.join(cloestSent(sent=sents[0], sents_vec=sents_vec)))

the fulton county grand jury said friday an investigation of atlanta's recent primary election produced `` no evidence '' that any irregularities took place .

INFO:tensorflow:Restoring parameters from ./model/rnn-tf-nce-model.ckpt
[ 1.00000008  0.99997272  0.99997263 ...,  0.99997267  0.99997273
  0.99997265]
the proposal would have to receive final legislative approval , by two-thirds majorities , before march 1 to be printed on the april 4 ballot , roberts said .

the suitcases had come while they were out , and had been put in their room , the concierge said .

the only day they `` have a chance to compete with large supermarkets is on sunday '' , the council's resolution said .

the hughes concern then took `` shortcuts '' on the project but got paid anyway , hemphill said .

the deeds of this team , through two seasons and in the two world's series that followed , have been written and talked about until hardly a word is left to be said .

the grand jury commented on a number of 

In [None]:
# Nearest neighbours of 'man': learned vectors (dot product) vs. GloVe (Euclidean).
# NOTE(review): `words_vec` is not defined in any visible cell — confirm where it comes from.
print(cloestWord(word='man', words_vec=words_vec, method='cos'))
print(cloestWord(word='man', words_vec=glove))

In [None]:
# Nearest neighbours of 'woman': learned vectors (dot product) vs. GloVe (Euclidean).
# NOTE(review): `words_vec` is not defined in any visible cell — confirm where it comes from.
print(cloestWord(word='woman', words_vec=words_vec, method='cos'))
print(cloestWord(word='woman', words_vec=glove))

In [None]:
# Nearest neighbours of 'however': learned vectors (dot product) vs. GloVe (Euclidean).
# NOTE(review): `words_vec` is not defined in any visible cell — confirm where it comes from.
print(cloestWord(word='however', words_vec=words_vec, method='cos'))
print(cloestWord(word='however', words_vec=glove))

In [None]:
# Nearest neighbours of 'his': learned vectors (dot product) vs. GloVe (Euclidean).
# NOTE(review): `words_vec` is not defined in any visible cell — confirm where it comes from.
print(cloestWord(word='his', words_vec=words_vec, method='cos'))
print(cloestWord(word='his', words_vec=glove))

In [None]:
# Nearest neighbours of 'zero': learned vectors (dot product) vs. GloVe (Euclidean).
# NOTE(review): `words_vec` is not defined in any visible cell — confirm where it comes from.
print(cloestWord(word='zero', words_vec=words_vec, method='cos'))
print(cloestWord(word='zero', words_vec=glove))

In [None]:
# Nearest neighbours of 'one': learned vectors (dot product) vs. GloVe (Euclidean).
# NOTE(review): `words_vec` is not defined in any visible cell — confirm where it comes from.
print(cloestWord(word='one', words_vec=words_vec, method='cos'))
print(cloestWord(word='one', words_vec=glove))

In [None]:
# Nearest neighbours of 'two': learned vectors (dot product) vs. GloVe (Euclidean).
# NOTE(review): `words_vec` is not defined in any visible cell — confirm where it comes from.
print(cloestWord(word='two', words_vec=words_vec, method='cos'))
print(cloestWord(word='two', words_vec=glove))