In [1]:
from Utils.FS import file
from nltk.tokenize.punkt import PunktSentenceTokenizer
from nltk.tokenize import word_tokenize
from nltk.corpus import brown
from scipy.sparse import coo_matrix, dok_matrix
from scipy.sparse.linalg import svds
from sklearn.preprocessing import normalize
import matplotlib.pyplot as plt
import numpy as np
from sklearn.manifold import TSNE

In [None]:
# Lower-case the Brown corpus once, both sentence-by-sentence and as one flat token list.
sents = [[token.lower() for token in sent] for sent in brown.sents()]
words = [word.lower() for word in brown.words()]

In [None]:
# Quick corpus statistics: token count, sentence count and the longest sentence (in tokens).
print("Number of tokens: {}".format(len(words)))
print("Number of sentences: {}".format(len(sents)))
print("Longest sentences length: {}".format(max(len(sent) for sent in sents)))

In [None]:
def words2dicts(words):
    """Build a word -> index mapping and its inverse.

    Indices are assigned in order of first appearance, starting at 0.

    Args:
        words: iterable of hashable tokens (duplicates allowed).

    Returns:
        A tuple ``(word_to_index, index_to_word)`` of two dicts.
    """
    word_to_index = {}
    for token in words:
        # The current dict size is exactly the next free index; setdefault
        # leaves already-seen tokens untouched.
        word_to_index.setdefault(token, len(word_to_index))

    index_to_word = {position: token for token, position in word_to_index.items()}
    return word_to_index, index_to_word

#def sents2wordContextMatrix(sents):
    

In [None]:
# Build the vocabulary: token -> integer id, plus the inverse id -> token lookup.
words_dict, inv_words_dict = words2dicts(words)
# Vocabulary size (number of distinct lower-cased tokens).
words_size = len(words_dict)
print("Number of unique tokens: {}".format(words_size))

In [None]:
def _window_pairs(tokens, words_dict, window):
    """Yield (token_id, context_id) for each token paired with up to `window` tokens seen before it."""
    recent = []  # ids of the most recently seen tokens, oldest first
    for token in tokens:
        token_id = words_dict[token]
        for context_id in recent:
            yield token_id, context_id
        # Keep at most `window` ids: drop the oldest before adding the new one.
        if len(recent) == window:
            recent.pop(0)
        recent.append(token_id)


def sents2wordContextMatrix(sents, words_dict, window=5):
    """Build a symmetric-window word/context co-occurrence matrix.

    Every token is paired with up to ``window`` tokens before it (forward
    pass) and up to ``window`` tokens after it (backward pass). The window is
    NOT reset at sentence boundaries, so context carries over from the
    neighbouring sentence.

    Args:
        sents: sequence of sentences, each a sequence of tokens. Both levels
            must support ``reversed``.
        words_dict: mapping token -> row/column index. The matrix is square
            with side ``len(words_dict)``.
        window: number of neighbouring tokens to use on each side (>= 1).

    Returns:
        A ``scipy.sparse.coo_matrix`` of dtype float64. Repeated pairs are
        stored as duplicate entries, which scipy sums on conversion.

    Raises:
        ValueError: if ``window`` is smaller than 1.
        KeyError: if a token is missing from ``words_dict``.
    """
    if window < 1:
        raise ValueError("window must be >= 1, got {}".format(window))

    # Size comes from the vocabulary passed in, not from a notebook global.
    size = len(words_dict)
    row = []
    col = []

    print('Doing forward pass...')
    forward_tokens = (word for sent in sents for word in sent)
    for token_id, context_id in _window_pairs(forward_tokens, words_dict, window):
        row.append(token_id)
        col.append(context_id)

    print('Doing backward pass...')
    backward_tokens = (word for sent in reversed(sents) for word in reversed(sent))
    for token_id, context_id in _window_pairs(backward_tokens, words_dict, window):
        row.append(token_id)
        col.append(context_id)

    # Each recorded pair counts once.
    data = [1] * len(row)
    return coo_matrix((data, (row, col)), shape=(size, size), dtype='float64')

In [None]:
def sents2freq(sents):
    """Count token occurrences over a collection of sentences.

    Args:
        sents: iterable of sentences, each an iterable of hashable tokens.

    Returns:
        dict mapping each token to the number of times it appears, with keys
        in order of first appearance.
    """
    counts = {}
    for sentence in sents:
        for token in sentence:
            counts[token] = counts.get(token, 0) + 1
    return counts

# Token -> occurrence count over the whole lower-cased corpus.
words_freq = sents2freq(sents)

In [None]:
# Word/context co-occurrence counts using 10 tokens of context on each side.
m = sents2wordContextMatrix(sents, words_dict, 10)

In [None]:
# L2-normalise each row of the co-occurrence matrix (axis=1) into a new matrix.
norm_m = normalize(m, norm='l2', axis=1, copy=True, return_norm=False)

print(norm_m.shape)
# Sanity check: sum of squared entries of row 1; expected to be 1 after L2
# normalisation, provided that row is not all zeros.
row = norm_m[1,:].todense()
row = np.square(row)
np.sum(row)

In [None]:
# Truncated SVD of the normalised matrix, keeping k=50 singular values/vectors.
u, s, vt = svds(norm_m, k=50)
#word_vec = u.dot(np.diag(s))
# Scale the left singular vectors by their singular values: one row per matrix row.
word_vec = np.matmul(u, np.diag(s))
print(word_vec.shape)

In [None]:
# Re-normalise each word vector (row) to unit L2 length.
# NOTE(review): this rebinds `word_vec`, so re-running only this cell normalises an already normalised array.
word_vec = normalize(word_vec, norm='l2', axis=1, copy=True, return_norm=False)

In [None]:
def plot(vocabs, words_vec):
    """Scatter-plot the first two components of each word's vector, labelled with the word.

    Args:
        vocabs: sequence of words to plot.
        words_vec: object indexable by word that returns a vector with at
            least two components.
    """
    xs = [words_vec[word][0] for word in vocabs]
    ys = [words_vec[word][1] for word in vocabs]

    plt.scatter(xs, ys)
    for label, px, py in zip(vocabs, xs, ys):
        plt.text(px, py, label)
    # Fit the axes tightly around the plotted points.
    plt.axis([min(xs), max(xs), min(ys), max(ys)])
    plt.show()

    


In [None]:
vocabs = ['man', 'woman', 'king', 'queen', 'male', 'female', 'boy', 'girl']
# Coordinates of each probe word: the last and second-to-last components of its SVD vector.
x = [word_vec[words_dict[vocab]][-1] for vocab in vocabs]
y = [word_vec[words_dict[vocab]][-2] for vocab in vocabs]

# Corpus frequency of each probe word (shown as the cell output).
[words_freq[vocab] for vocab in vocabs]

In [None]:
# `plot` looks vectors up by word and draws components [0] and [1], whereas
# `word_vec` is an array indexed by vocabulary id. Build a word -> vector
# mapping, reversed to mirror the [::-1] indexing used for x/y above.
# (The original call referenced an undefined name, `words_vec`.)
plot(vocabs, {vocab: word_vec[words_dict[vocab]][::-1] for vocab in vocabs})

In [2]:
# Load pre-trained GloVe vectors through the project-local DataLoader helper.
# NOTE(review): the path is absolute and machine-specific; point it at your own copy of the file.
# Presumably `glove` maps word -> vector, since it is later indexed by word; confirm against GloVe.load2.
from DataLoader import GloVe

glove = GloVe.load2('/Users/Shared/data/glove.6B/glove.6B.50d.txt')

Start: Loading Glove Model
End: Loaded 400000 rows.


In [3]:
# Plot the same probe words using the GloVe vectors (first two components).
# NOTE(review): needs the cell defining `plot` to have been run in this kernel first.
vocabs = ['man', 'woman', 'king', 'queen', 'male', 'female', 'boy', 'girl']
plot(vocabs, glove)

NameError: name 'plot' is not defined

In [4]:
def plotTSNE(vocabs, words_vec, perplexity=None):
    """Project the vectors of `vocabs` to 2-D with t-SNE and draw a labelled scatter plot.

    Args:
        vocabs: sequence of words to plot.
        words_vec: object indexable by word that returns that word's vector.
        perplexity: t-SNE perplexity. When None, it is scikit-learn's default
            of 30, capped at ``len(vocabs) - 1``, because scikit-learn
            requires perplexity to be smaller than the number of samples.
    """
    data = np.array([words_vec[vocab] for vocab in vocabs])
    print(data.shape)

    if perplexity is None:
        # A handful of probe words is far fewer than the default perplexity.
        perplexity = float(min(30, max(data.shape[0] - 1, 1)))

    # Fixed random_state keeps the layout reproducible between runs.
    tsne = TSNE(n_components=2, random_state=0, perplexity=perplexity)
    Y = tsne.fit_transform(data)

    plt.scatter(Y[:, 0], Y[:, 1])
    for label, x, y in zip(vocabs, Y[:, 0], Y[:, 1]):
        plt.annotate(label, xy=(x, y), xytext=(0, 0), textcoords='offset points')
    plt.show()

In [None]:
# t-SNE view of the probe words using the GloVe vectors.
plotTSNE(vocabs, glove)

In [None]:
from sklearn.manifold import TSNE
import numpy as np



In [None]:
from sklearn.manifold import TSNE

def main():
    """Load the embeddings file named by the first command-line argument and show a t-SNE plot.

    Only the first 1000 rows of the embedding matrix are projected and drawn.
    """
    # Local import: `sys` is not imported anywhere else in this notebook.
    import sys

    embeddings_file = sys.argv[1]
    wv, vocabulary = load_embeddings(embeddings_file)

    tsne = TSNE(n_components=2, random_state=0)
    np.set_printoptions(suppress=True)
    Y = tsne.fit_transform(wv[:1000, :])

    plt.scatter(Y[:, 0], Y[:, 1])
    # zip stops at the shortest input, so only the projected rows get labels.
    for label, x, y in zip(vocabulary, Y[:, 0], Y[:, 1]):
        plt.annotate(label, xy=(x, y), xytext=(0, 0), textcoords='offset points')
    plt.show()
 
 
def load_embeddings(file_name):
    """Read a text embeddings file with one ``token v1 v2 ... vn`` entry per line.

    Args:
        file_name: path to a UTF-8 text file. On each line the token is
            separated from its space-separated vector by the first space.

    Returns:
        A tuple ``(wv, vocabulary)``. ``wv`` is a 2-D numpy array with one row
        per line, and ``vocabulary`` is a tuple of tokens in file order.

    Raises:
        ValueError: if the file contains no entries.
    """
    # Built-in open() breaks lines only on \n, \r and \r\n. codecs.open also
    # splits on other Unicode separators (e.g. U+2028) that can occur inside
    # a token, and `codecs` was never imported in this notebook.
    with open(file_name, 'r', encoding='utf-8') as f_in:
        entries = [line.strip().split(' ', 1) for line in f_in]

    if not entries:
        raise ValueError("no embeddings found in {!r}".format(file_name))

    vocabulary, vectors = zip(*entries)
    # ndmin=2 keeps a one-line file as a (1, dim) matrix so callers can slice rows.
    wv = np.loadtxt(list(vectors), ndmin=2)
    return wv, vocabulary