In [1]:
from Utils.FS import file
from nltk.tokenize.punkt import PunktSentenceTokenizer
from nltk.tokenize import word_tokenize
from nltk.corpus import brown
from scipy.sparse import coo_matrix, dok_matrix
from scipy.sparse.linalg import svds
from sklearn.preprocessing import normalize
import matplotlib.pyplot as plt
import numpy as np
from sklearn.manifold import TSNE
from wordcloud import WordCloud
from nltk.corpus import stopwords
import tensorflow as tf
import math
import TextPreprocess.words2dict as words2dict

In [2]:
# Seed NumPy's global random generator so that np.random draws made by later
# cells are repeatable across kernel restarts.
# NOTE(review): no TensorFlow seed is set in the visible cells, so variable
# initialisation and NCE negative sampling are presumably not covered by this.
np.random.seed(1234)

In [3]:
# Lower-case the Brown corpus once: `sents` is a list of token lists,
# `words` is the same corpus as one flat token list.
sents = [[tok.lower() for tok in raw_sent] for raw_sent in brown.sents()]
words = [tok.lower() for tok in brown.words()]

In [4]:
# Quick corpus summary: total tokens, number of sentences, and the token
# count of the longest sentence.
print("Number of tokens: {}".format(len(words)))
print("Number of sentences: {}".format(len(sents)))
print("Longest sentences length: {}".format(max([len(sent) for sent in sents])))

Number of tokens: 1161192
Number of sentences: 57340
Longest sentences length: 180


In [5]:
# Build the vocabulary lookups from the flat token list.
# `words_dict` is indexed by token and its values are used as matrix/embedding
# indices below; `inv_words_dict` is indexed by those integer ids.
# NOTE(review): presumably token -> id and id -> token maps; confirm against
# TextPreprocess.words2dict.convert. The batch generators also look up the
# padding token '--' in `words_dict`, so it must be a key.
words_dict, inv_words_dict = words2dict.convert(words)
words_size = len(words_dict)
print("Number of unique tokens: {}".format(words_size))

Number of unique tokens: 49815


In [6]:
def singleSideWindow(sents, words_dict, window_size, reverse = False):
    """Count (word, earlier-word) pairs seen within a sliding window.

    The corpus is read token by token (sentences and tokens both reversed when
    ``reverse`` is true). For every token, each of the up to ``window_size``
    tokens read just before it contributes one count at
    ``[words_dict[token], words_dict[earlier_token]]``. A token is never paired
    with an identical token, and the window is *not* reset at sentence
    boundaries, so it spans across them.

    Args:
        sents: sequence of token sequences (must support ``reversed``).
        words_dict: mapping token -> integer id in ``range(len(words_dict))``.
        window_size: number of preceding tokens kept as context; ``0`` yields
            an all-zero matrix.
        reverse: read the corpus back to front.

    Returns:
        ``coo_matrix`` of shape ``(len(words_dict), len(words_dict))`` and
        dtype float64. Repeated pairs are stored as duplicate entries, which
        SciPy sums on conversion/arithmetic.

    Raises:
        ValueError: if ``window_size`` is negative.
        KeyError: if a token is missing from ``words_dict``.
    """
    from collections import deque

    if window_size < 0:
        raise ValueError('window_size must be non-negative, got {}'.format(window_size))

    # deque(maxlen=...) drops the oldest token automatically and, unlike the
    # previous list.pop(0), also works when window_size is 0.
    window = deque(maxlen=window_size)
    row = []
    col = []

    for sent in reversed(sents) if reverse else sents:
        for word in reversed(sent) if reverse else sent:
            word_id = words_dict[word]
            for w in window:
                if w == word:
                    # Self co-occurrence is deliberately not counted.
                    continue
                row.append(word_id)
                col.append(words_dict[w])
            window.append(word)

    size = len(words_dict)
    data = [1] * len(row)
    return coo_matrix((data, (row, col)), shape=(size, size), dtype='float64')
    

def sents2wordContextMatrix(sents, words_dict, window_size = 5):
    """Build a symmetric-window word/context count matrix.

    Adds the counts from a forward pass and a backward pass of
    ``singleSideWindow`` so that every token is paired with up to
    ``window_size`` neighbours on each side.

    Args:
        sents: sequence of token sequences.
        words_dict: mapping token -> integer id.
        window_size: context size on each side.

    Returns:
        Sparse float64 matrix of shape ``(len(words_dict), len(words_dict))``.
        NOTE(review): the result of adding sparse matrices is whatever format
        SciPy returns for the sum, not necessarily COO - confirm if a caller
        depends on the format.
    """
    # Size the accumulator from the argument rather than the notebook-level
    # `words_size` global, so it always matches what singleSideWindow returns.
    vocab_size = len(words_dict)
    # dtype must be passed by keyword: the second positional parameter of
    # coo_matrix is `shape`, not `dtype`.
    m = coo_matrix((vocab_size, vocab_size), dtype='float64')

    print('Doing forward pass...')
    m += singleSideWindow(sents, words_dict, window_size)

    print('Doing backward pass...')
    m += singleSideWindow(sents, words_dict, window_size, True)

    return m

def sents2wordCoocurrenceMatrix(sents, words_dict, window_size = 10):
    """Build a normalised, symmetric word co-occurrence matrix.

    The corpus is read token by token with a sliding window of the last
    ``window_size`` tokens (the window carries over sentence boundaries). Each
    time a new token is read, every unordered pair of tokens currently in the
    window is counted once in both ``[a, b]`` and ``[b, a]``; the new token
    only joins the window afterwards. A pair is therefore counted repeatedly
    for as long as both tokens stay in the window. All counts are finally
    divided by the total number of recorded entries.

    Args:
        sents: iterable of token sequences.
        words_dict: mapping token -> integer id in ``range(len(words_dict))``.
        window_size: number of most recent tokens kept in the window.

    Returns:
        Sparse float64 matrix of shape ``(len(words_dict), len(words_dict))``;
        all zeros when no pair was produced.

    Raises:
        ValueError: if ``window_size`` is negative.
        KeyError: if a token is missing from ``words_dict``.
    """
    from collections import deque
    from itertools import combinations

    if window_size < 0:
        raise ValueError('window_size must be non-negative, got {}'.format(window_size))

    # Use the argument, not the notebook-level `words_size` global.
    vocab_size = len(words_dict)
    # deque(maxlen=...) evicts the oldest token itself and tolerates size 0.
    window = deque(maxlen=window_size)
    row = []
    col = []
    for sent in sents:
        for word in sent:
            # combinations() yields (window[i], window[j]) for i < j, in the
            # same order as the nested index loops it replaces.
            for first, second in combinations(window, 2):
                first_id = words_dict[first]
                second_id = words_dict[second]
                row += [first_id, second_id]
                col += [second_id, first_id]
            window.append(word)
    data = [1] * len(row)
    print('Preparing sparse matrix...')
    print('Length of data: {}'.format(len(data)))
    # Guard the normalisation: an empty/too-short corpus produces no pairs.
    scale = 1.0 / len(data) if data else 0.0
    return coo_matrix((data, (row, col)), shape=(vocab_size, vocab_size), dtype='float64').multiply(scale)


In [7]:
def singleSideWindowGenerator(c, w, sents, words_dict, window_size = 5, batch_size = 32, reverse = False):
    """Yield fixed-size batches of (context ids, [word id]) pairs for one pass.

    The window starts filled with ``window_size`` copies of the padding token
    ``'--'`` and slides over the corpus without resetting between sentences.
    For every token, one pair is emitted per window entry: the window entry's
    id is appended to ``c`` and ``[token id]`` to ``w``.

    ``c`` and ``w`` are the caller's accumulator lists: they are appended to in
    place until the first batch is yielded, after which fresh lists are used.
    Pairs left over at the end that do not fill a batch are not yielded.

    Args:
        c: list collecting context ids.
        w: list collecting single-element ``[word id]`` lists.
        sents: sequence of token sequences (must support ``reversed`` when
            ``reverse`` is true).
        words_dict: mapping token -> id; must contain ``'--'``.
        window_size: number of preceding tokens used as context.
        batch_size: number of pairs per yielded batch.
        reverse: walk sentences and tokens back to front.

    Yields:
        ``(c, w)`` tuples, each list holding ``batch_size`` entries.
    """
    history = ['--'] * window_size
    sentence_iter = reversed(sents) if reverse else sents
    for sentence in sentence_iter:
        token_iter = reversed(sentence) if reverse else sentence
        for token in token_iter:
            for past_token in history:
                c.append(words_dict[past_token])
                w.append([words_dict[token]])
                if len(c) == batch_size:
                    yield c, w
                    # Start new accumulators; the yielded lists belong to the consumer.
                    c, w = [], []
            # Slide the window forward by one token.
            history.pop(0)
            history.append(token)

def sents2batchGenerator(sents, words_dict, window_size = 5, batch_size=32):
    """Endlessly yield training batches of (word ids, [context id]) pairs.

    Each epoch makes a forward pass over the corpus followed by a backward
    pass (sentences and tokens reversed). In every pass the window starts as
    ``window_size`` copies of the padding token ``'--'`` and slides across
    sentence boundaries without being reset. For each token, one pair is
    produced per window entry. Pairs that do not fill a batch at the end of a
    pass are carried over into the next pass, so no pair is dropped.

    Arguments are validated when the generator is first advanced, not when
    this function is called.

    Args:
        sents: re-iterable sequence of token sequences (must support
            ``reversed``).
        words_dict: mapping token -> id; must contain ``'--'``.
        window_size: number of preceding tokens used as context (>= 1).
        batch_size: number of pairs per batch (>= 1).

    Yields:
        ``(w, c)`` where ``w`` is a list of ``batch_size`` word ids and ``c``
        a list of ``batch_size`` single-element ``[context id]`` lists.

    Raises:
        ValueError: if ``window_size`` or ``batch_size`` is below 1, or if the
            corpus has no tokens (previously this made ``next()`` spin
            forever or fail with an unrelated IndexError).
        KeyError: if a token or ``'--'`` is missing from ``words_dict``.
    """
    from collections import deque

    if window_size < 1:
        raise ValueError('window_size must be at least 1, got {}'.format(window_size))
    if batch_size < 1:
        raise ValueError('batch_size must be at least 1, got {}'.format(batch_size))
    if not any(len(sent) for sent in sents):
        raise ValueError('corpus contains no tokens, no batch can ever be produced')

    c = []
    w = []

    while True:
        # One epoch = forward pass, then backward pass; both share the same code.
        for backward in (False, True):
            # maxlen makes append() evict the oldest token automatically.
            window = deque(['--'] * window_size, maxlen=window_size)
            for sent in (reversed(sents) if backward else sents):
                for word in (reversed(sent) if backward else sent):
                    for context in window:
                        c.append([words_dict[context]])
                        w.append(words_dict[word])
                        if len(c) == batch_size:
                            yield w, c
                            c = []
                            w = []
                    window.append(word)

    

In [8]:
def sents2freq(sents):
    """Count token occurrences over all sentences.

    Args:
        sents: iterable of token sequences.

    Returns:
        Plain ``dict`` mapping each token to the number of times it appears,
        with keys in order of first appearance.
    """
    counts = {}
    for sentence in sents:
        for token in sentence:
            counts[token] = counts.get(token, 0) + 1
    return counts

# Token -> occurrence count over the lower-cased corpus.
# NOTE(review): `words_freq` is not referenced again in the visible cells.
words_freq = sents2freq(sents)

In [9]:
# Batching configuration shared with the graph definition (BATCH_SIZE fixes
# the placeholder shapes there).
WINDOW_SIZE = 2
BATCH_SIZE = 32
# One module-level generator instance: it is stateful, so re-running the
# training cell continues from wherever the previous run stopped unless this
# cell is executed again first.
generator = sents2batchGenerator(sents, words_dict, window_size = WINDOW_SIZE, batch_size=BATCH_SIZE)

In [13]:
# Model hyper-parameters: embedding width, vocabulary size, and the number of
# negative classes sampled per batch by the NCE loss.
# NOTE(review): `VOCABULAY_SIZE` looks like a typo for VOCABULARY_SIZE; left
# unchanged because other cells may reference it.
DIMENSION = 50
VOCABULAY_SIZE = len(words_dict)
NEGATIVE_SAMPLE = 64

graph = tf.Graph()

with graph.as_default():
    
    # Fixed-size batch inputs: `inputs` takes BATCH_SIZE ids that are looked
    # up in `embeddings`; `labels` takes one target id per row for the NCE loss.
    inputs = tf.placeholder(tf.int32, shape=[BATCH_SIZE])
    labels = tf.placeholder(tf.int32, shape=[BATCH_SIZE, 1])

    with tf.device('/cpu:0'):
    
        # Trainable embedding table, initialised uniformly in [-1, 1).
        embeddings = tf.Variable(
            tf.random_uniform([VOCABULAY_SIZE, DIMENSION], -1.0, 1.0))
        # max_norm=1: looked-up rows are presumably clipped to L2 norm <= 1
        # (see tf.nn.embedding_lookup docs) - TODO confirm for the TF version in use.
        embed = tf.nn.embedding_lookup(embeddings, inputs, max_norm=1)

        # NCE output weights and biases are created with trainable=False, so the
        # optimizer below does not update them; they are the only variables
        # handled by `word2VecSaver` and are restored from a checkpoint in the
        # training cell.
        nce_weights = tf.Variable(
            tf.truncated_normal([VOCABULAY_SIZE, DIMENSION],
                                stddev=1.0 / math.sqrt(DIMENSION)), trainable=False)

        nce_biases = tf.Variable(tf.zeros([VOCABULAY_SIZE]), trainable=False)

        # Mean NCE loss over the batch, using NEGATIVE_SAMPLE sampled classes.
        loss = tf.reduce_mean(
          tf.nn.nce_loss(weights=nce_weights,
                     biases=nce_biases,
                     labels=labels,
                     inputs=embed,
                     num_sampled=NEGATIVE_SAMPLE,
                     num_classes=VOCABULAY_SIZE))

        # Alternative optimizers are kept commented out for reference.
        #optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)
        # Momentum optimizer with learning rate 1.0 and momentum 0.5.
        optimizer = tf.train.MomentumOptimizer(1.0, 0.5).minimize(loss)
        #optimizer = tf.train.AdamOptimizer().minimize(loss) # super slow
        
        init = tf.global_variables_initializer()
        # NOTE(review): the saver covers only the NCE weights/biases; `embeddings`
        # is not included, so the learned word vectors cannot be checkpointed
        # through this saver.
        word2VecSaver = tf.train.Saver({'NCE_Weights': nce_weights, 'NCE_Biases': nce_biases})

In [14]:
def cloestWord(word, words_vec, count = 10, method=None):
    """Return the ``count`` vocabulary entries nearest to ``word``.

    Args:
        word: query token; must be a key of ``words_vec``.
        words_vec: mapping token -> vector (NumPy array or list of numbers,
            all of the same length).
        count: maximum number of neighbours to return. Previously this
            argument was ignored and ten results were always returned.
        method: ``'cos'`` ranks by dot product, highest first; this equals
            cosine similarity only when the vectors are unit length. Any other
            value ranks by squared Euclidean distance, smallest first.

    Returns:
        List of at most ``count`` tokens, best match first. The query word
        itself is part of the vocabulary and is normally the first entry.

    Raises:
        KeyError: if ``word`` is not in ``words_vec``.
    """
    count = max(count, 0)
    # Materialise the key order once; it is shared by the score array and the
    # final index -> token lookup.
    keys = list(words_vec.keys())
    query = np.asarray(words_vec[word])
    vectors = np.array([words_vec[key] for key in keys])

    if method == 'cos':
        scores = vectors.dot(query)
        best = scores.argsort()[::-1][:count]
    else:
        dist = np.square(vectors - query).sum(axis=1)
        best = dist.argsort()[:count]
    return [keys[i] for i in best]

In [16]:
# Train the embeddings with the frozen, checkpoint-restored NCE parameters.
num_steps = 200000
WORDS2VEC_MODEL = './model/brown-Words2Vec-{}.ckpt'.format(DIMENSION)

with tf.Session(graph=graph) as session:
    # Initialise every variable first, then overwrite the variables known to
    # the saver (NCE weights and biases) with the checkpoint values.
    init.run()
    word2VecSaver.restore(session, WORDS2VEC_MODEL)   
      
    average_loss = 0
    for step in range(num_steps):
        # The generator yields (word ids, [[context id], ...]); they feed the
        # `inputs` and `labels` placeholders respectively.
        batch_inputs, batch_labels = next(generator)
        feed_dict = {inputs: batch_inputs, labels: batch_labels}

        _, loss_val = session.run([optimizer, loss], feed_dict=feed_dict)
        average_loss += loss_val

        # Progress report every 2000 steps (skipped at step 0).
        # NOTE(review): the first report sums the losses of steps 0..2000
        # (2001 values) but divides by 2000, so it is slightly overstated.
        if step % 2000 == 0:
            if step > 0:
                average_loss /= 2000
                print('Average loss at step ', step, ': ', average_loss)
                average_loss = 0
                
                # Snapshot the current embeddings, L2-normalise the rows of
                # the snapshot in place, and print the nearest neighbours of
                # 'two' by dot product as a qualitative check.
                emb = embeddings.eval()
                normalize(emb, norm='l2', axis=1, copy=False)
                words_vec = {}
                for i in range(emb.shape[0]):
                    words_vec[inv_words_dict[i]] = emb[i]
                print(cloestWord('two', words_vec, method='cos'))
                
                
    # NOTE(review): saving is disabled, and `MODEL` is not defined in the
    # visible cells (the path variable above is WORDS2VEC_MODEL).
    #save_path = word2VecSaver.save(session, MODEL)
    # NOTE(review): only reached when the loop runs to completion; the recorded
    # output ends in KeyboardInterrupt, in which case `final_embeddings` is
    # never assigned and the cells below cannot run.
    final_embeddings = embeddings.eval()

INFO:tensorflow:Restoring parameters from ./model/brown-Words2Vec-50.ckpt
Average loss at step  2000 :  6.57848383498
['two', 'efficient', 'other', 'real', 'for', "'", 'step', 'god', 'yale', 'is']
Average loss at step  4000 :  6.03950827336
['two', 'other', "'", 'real', 'efficient', 'step', 'time', 'attention', 'before', 'best']
Average loss at step  6000 :  5.77292838049
['two', 'other', 'political', 'first', 'men', 'best', 'western', 'time', 'step', 'day']
Average loss at step  8000 :  5.70323363233
['two', 'other', 'first', 'men', 'london', 'best', 'given', 'time', 'very', 'western']
Average loss at step  10000 :  5.48292535138
['two', 'other', 'many', 'first', 'years', 'political', 'step', 'real', 'london', 'western']
Average loss at step  12000 :  5.33735982132
['two', 'other', 'first', 'years', 'political', 'attention', 'cities', 'step', 'most', 'continued']
Average loss at step  14000 :  5.35958235884
['two', 'other', 'years', 'first', 'cities', 'attention', 'western', 'best', '

KeyboardInterrupt: 

In [None]:
# NOTE(review): `normalize` is already imported in the first cell; this
# repeated import is redundant.
from sklearn.preprocessing import normalize

# L2-normalise every row of the trained embedding matrix in place
# (copy=False), so that dot products between rows act as cosine similarities.
normalize(final_embeddings, norm='l2', axis=1, copy=False)
print(final_embeddings.shape)

In [None]:
# Sanity check: squared L2 norm of row 2, expected to be close to 1 once the
# rows have been normalised.
sum((final_embeddings[2] ** 2))

In [None]:
# Map every vocabulary token to its row of the final embedding matrix.
# Note: this rebinds the `words_vec` name that the training cell also assigns.
words_vec = {}
for i in range(final_embeddings.shape[0]):
    words_vec[inv_words_dict[i]] = final_embeddings[i]

In [None]:
def plotData(vocabs, X, Y):
    """Draw a labelled 2-D scatter plot of words.

    Args:
        vocabs: sequence of labels, one per point.
        X: x coordinates (list or 1-D array), same length as ``vocabs``.
        Y: y coordinates (list or 1-D array), same length as ``vocabs``.

    Nothing is drawn when there are no points.
    """
    if len(X) == 0 or len(Y) == 0:
        # min()/max() below would fail on empty input; there is nothing to plot.
        return

    # Create the figure explicitly instead of calling plt.clf() first: clf()
    # acts on the *current* figure and creates an empty one when none is open,
    # which leaves a stray blank figure next to the real plot.
    fig, ax = plt.subplots(figsize=(36, 36))
    ax.scatter(X, Y)
    # Clamp the axes to the data range.
    ax.axis([min(X), max(X), min(Y), max(Y)])
    for label, x, y in zip(vocabs, X, Y):
        ax.annotate(label, xy=(x, y), xytext=(0, 0), textcoords='offset points')
    plt.show()

In [None]:
def plot(vocabs, words_vec):
    """Plot each word in ``vocabs`` at the first two components of its vector.

    Args:
        vocabs: sequence of tokens present in ``words_vec``.
        words_vec: mapping token -> vector with at least two components.
    """
    def component(index):
        # One coordinate list per axis, in the order of `vocabs`.
        return [words_vec[token][index] for token in vocabs]

    plotData(vocabs, component(0), component(1))

In [None]:
def plotTSNE(vocabs, vectors):
    """Project the vectors of ``vocabs`` to 2-D with t-SNE and plot them.

    Args:
        vocabs: sequence of tokens present in ``vectors``.
        vectors: mapping token -> vector.

    The t-SNE settings are fixed (perplexity 30, 5000 iterations, exact
    method, ``random_state=7890``).
    """
    reducer = TSNE(perplexity=30, n_components=2, n_iter=5000, random_state = 7890, method='exact')
    stacked = np.array([vectors[token] for token in vocabs])
    projected = reducer.fit_transform(stacked)

    # Column 0 is the x coordinate, column 1 the y coordinate.
    plotData(vocabs, projected[:, 0], projected[:, 1])

In [None]:
# NOTE(review): project-local import placed mid-notebook; consider moving it
# to the import cell at the top.
from DataLoader import GloVe

# Load the pre-trained 50-dimensional GloVe vectors as a reference to compare
# against the embeddings trained above.
# NOTE(review): `glove` is used below like a mapping word -> vector (`.keys()`
# and key lookup); confirm against DataLoader.GloVe.load2.
glove = GloVe.load2('./data/GloVe/glove.6B.50d.txt')

In [None]:
# NOTE(review): `vocabs` is not used in the visible cells.
vocabs = ['man', 'woman', 'king', 'queen', 'male', 'female', 'boy', 'girl']
# Re-seed so the sample below is the same on every run of this cell.
np.random.seed(1234)

# Draw 2000 random vocabulary ids (with replacement, so duplicates are
# possible) and keep the words that also have a GloVe vector.
random_vocabs = []
for i in np.random.randint(0, len(words_dict), 2000):
    if inv_words_dict[i] in glove.keys():
        random_vocabs.append(inv_words_dict[i])
        
print(len(random_vocabs))

In [None]:
#plotTSNE(random_vocabs, words_vec)

In [None]:
#plotTSNE(random_vocabs, glove)

In [None]:
# Compare nearest neighbours from the trained embeddings (ranked by dot
# product on the normalised vectors) with those from GloVe (ranked by squared
# Euclidean distance) for a fixed list of probe words. One loop replaces seven
# copy-pasted cells; the printed lines and their order are unchanged.
for query in ['man', 'woman', 'however', 'his', 'zero', 'one', 'two']:
    print(cloestWord(query, words_vec, method='cos'))
    print(cloestWord(query, glove))