In [1]:
from Utils.FS import file
from nltk.tokenize.punkt import PunktSentenceTokenizer
from nltk.tokenize import word_tokenize
from nltk.corpus import brown
from scipy.sparse import coo_matrix, dok_matrix
from scipy.sparse.linalg import svds
from sklearn.preprocessing import normalize
import matplotlib.pyplot as plt
import numpy as np
from sklearn.manifold import TSNE
from wordcloud import WordCloud
from nltk.corpus import stopwords

In [2]:
# Seed NumPy's global random number generator so NumPy-driven randomness is repeatable.
np.random.seed(1234)

In [3]:
# Read the Brown corpus both sentence-by-sentence and as one flat token
# stream, lower-casing every token on the way in.
sents = [[token.lower() for token in sent] for sent in brown.sents()]
words = [word.lower() for word in brown.words()]

In [4]:
# Quick size summary of the corpus: token count, sentence count, longest sentence.
print("Number of tokens: {}".format(len(words)))
print("Number of sentences: {}".format(len(sents)))
print("Longest sentences length: {}".format(max(map(len, sents))))

Number of tokens: 1161192
Number of sentences: 57340
Longest sentences length: 180


In [5]:
def words2dicts(words):
    """Build a vocabulary from a sequence of tokens.

    Every distinct token is given the next unused integer id, in order of
    first appearance, starting at 0.

    Args:
        words: iterable of hashable tokens.

    Returns:
        A pair ``(token_to_id, id_to_token)`` of mutually inverse dicts.
    """
    token_to_id = {}
    for token in words:
        # The current size of the dict is always the next free id.
        token_to_id.setdefault(token, len(token_to_id))

    id_to_token = {idx: token for token, idx in token_to_id.items()}
    return token_to_id, id_to_token

In [6]:
# Build the word -> id and id -> word lookup tables for the whole corpus
# and keep the vocabulary size around as a module-level value.
words_dict, inv_words_dict = words2dicts(words)
words_size = len(words_dict)
print("Number of unique tokens: {}".format(words_size))

Number of unique tokens: 49815


In [7]:
def singleSideWindow(sents, words_dict, window_size, reverse = False):
    """Count, for each token, the tokens visited just before it.

    The corpus is walked token by token while a window of the last
    ``window_size`` visited tokens is maintained. For the current token, one
    count is recorded at (current token id, window token id) for every window
    token that differs from it. The window is NOT cleared between sentences,
    so it carries over sentence boundaries.

    Args:
        sents: sequence of sentences, each a sequence of tokens.
        words_dict: mapping token -> integer id.
        window_size: maximum number of previously visited tokens to keep.
        reverse: when true, sentences and the tokens inside them are walked
            back to front.

    Returns:
        A float64 ``coo_matrix`` of shape (len(words_dict), len(words_dict)).
        Repeated (row, col) pairs are stored as separate entries of value 1.
    """
    history = []
    rows, cols, counts = [], [], []

    sentences = reversed(sents) if reverse else sents
    for sentence in sentences:
        tokens = reversed(sentence) if reverse else sentence
        for token in tokens:
            for previous in history:
                if previous != token:
                    rows.append(words_dict[token])
                    cols.append(words_dict[previous])
                    counts.append(1)
            # Drop the oldest token once the window is full, then remember this one.
            if len(history) == window_size:
                del history[0]
            history.append(token)

    vocab_size = len(words_dict)
    return coo_matrix((counts, (rows, cols)), shape=(vocab_size, vocab_size), dtype='float64')
    

def sents2wordContextMatrix(sents, words_dict, window_size = 5):
    """Build a two-sided word/context count matrix.

    Adds the counts of a forward pass and a backward pass of
    ``singleSideWindow`` over ``sents``.

    Args:
        sents: sequence of sentences, each a sequence of tokens.
        words_dict: mapping token -> integer id.
        window_size: number of neighbouring tokens considered on each side.

    Returns:
        A SciPy sparse float64 matrix of shape
        (len(words_dict), len(words_dict)).
    """
    # Size the accumulator from the vocabulary that was passed in rather than
    # from a notebook-level global, so it always matches the shape of the
    # matrices returned by singleSideWindow.
    vocab_size = len(words_dict)
    # dtype must be given by keyword: the second positional parameter of
    # coo_matrix is `shape`, not `dtype`.
    m = coo_matrix((vocab_size, vocab_size), dtype='float64')

    print('Doing forward pass...')
    m = m + singleSideWindow(sents, words_dict, window_size)

    print('Doing backward pass...')
    m = m + singleSideWindow(sents, words_dict, window_size, True)

    return m

def sents2wordCoocurrenceMatrix(sents, words_dict, window_size = 10):
    """Build a normalised, symmetric co-occurrence matrix from a sliding window.

    Each time a token is read, every pair of tokens already held in the
    window is recorded once in each direction; only afterwards is the new
    token pushed into the window (dropping the oldest entry when the window
    is full). A pair is therefore counted again on every step during which
    both of its members remain in the window. The window is not cleared
    between sentences, and edge cases (e.g. the final tokens of the corpus)
    are deliberately not handled specially.

    Args:
        sents: iterable of sentences, each an iterable of tokens.
        words_dict: mapping token -> integer id.
        window_size: maximum number of tokens kept in the window.

    Returns:
        A SciPy sparse float64 matrix of shape
        (len(words_dict), len(words_dict)) whose entries are the recorded
        counts divided by the total number of recorded entries, so they sum
        to 1. If nothing was recorded the matrix is all zeros.
    """
    window = []
    row = []
    col = []
    data = []
    for sent in sents:
        for word in sent:
            for i in range(len(window) - 1):
                first = words_dict[window[i]]
                for j in range(i + 1, len(window)):
                    second = words_dict[window[j]]
                    # Record the pair symmetrically.
                    row += [first, second]
                    col += [second, first]
                    data += [1, 1]
            if len(window) == window_size:
                window.pop(0)
            window.append(word)

    # Use the vocabulary that was passed in, not a notebook-level global, so
    # the function also works for vocabularies other than the global one.
    vocab_size = len(words_dict)
    print('Preparing sparse matrix...')
    print('Length of data: {}'.format(len(data)))
    # Guard against division by zero when no pair was recorded at all.
    scale = 1.0 / len(data) if data else 0.0
    return coo_matrix((data, (row, col)), shape=(vocab_size, vocab_size), dtype='float64').multiply(scale)


In [8]:
def sents2ConextVector(sents, words_dict, window_size = 5):
    """Yield ``(context, target)`` training pairs for next-word prediction.

    The context is the list of ids of the ``window_size`` tokens that came
    immediately before the target token. It starts out filled with the id of
    the ``'--'`` token and is not reset at sentence boundaries.

    Args:
        sents: iterable of sentences, each an iterable of tokens.
        words_dict: mapping token -> integer id; must contain ``'--'``.
        window_size: number of preceding token ids in each context.

    Yields:
        Tuples ``(context_ids, target_id)``. Each ``context_ids`` is a fresh
        list, so callers may store it safely.

    Raises:
        KeyError: if ``'--'`` or a corpus token is missing from ``words_dict``.
    """
    pad_id = words_dict['--']
    window = [pad_id for _ in range(window_size)]

    for sent in sents:
        for word in sent:
            word_id = words_dict[word]
            # Yield a snapshot: the working window is mutated right after, and
            # yielding it directly would make every collected context alias
            # the same (final) list.
            yield list(window), word_id
            window.pop(0)
            window.append(word_id)

In [9]:
def sents2freq(sents):
    """Count how many times each token occurs across all sentences.

    Args:
        sents: iterable of sentences, each an iterable of hashable tokens.

    Returns:
        A dict mapping token -> number of occurrences.
    """
    counts = {}
    for sentence in sents:
        for token in sentence:
            counts[token] = counts.get(token, 0) + 1
    return counts

# Occurrence count of every token in the lower-cased corpus sentences.
words_freq = sents2freq(sents)

In [87]:
from keras.utils.np_utils import to_categorical

WINDOW_SIZE = 5
#m = sents2wordContextMatrix(sents, words_dict, WINDOW_SIZE)
# Collect (context ids, target id) pairs for the next-word model. The context
# length comes from WINDOW_SIZE so it stays in sync with the model's input length.
generator = sents2ConextVector(sents, words_dict, WINDOW_SIZE)
X = []
y = []
for context, word in generator:
    # Store a copy: the generator keeps updating its window list in place, so
    # appending the yielded object itself would leave every row of X identical.
    X.append(list(context))
    y.append(word)

In [88]:
from keras.models import Sequential
from keras.layers import Embedding, Dense, Flatten
from keras.constraints import UnitNorm

DIMENSION = 50

# Next-word model: WINDOW_SIZE context ids -> DIMENSION-d embeddings ->
# flatten -> softmax over the whole vocabulary. The embedding width is taken
# from DIMENSION instead of repeating the literal.
emb = Embedding(len(words_dict), DIMENSION, input_length=WINDOW_SIZE, embeddings_constraint=UnitNorm(axis=1))
model = Sequential()
model.add(emb)
model.add(Flatten())
model.add(Dense(len(words_dict), activation='softmax'))
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_14 (Embedding)     (None, 5, 50)             2490750   
_________________________________________________________________
flatten_13 (Flatten)         (None, 250)               0         
_________________________________________________________________
dense_12 (Dense)             (None, 49815)             12503565  
Total params: 14,994,315
Trainable params: 14,994,315
Non-trainable params: 0
_________________________________________________________________
None


In [89]:
# Adam optimizer with sparse categorical cross-entropy as the loss.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

In [90]:
# Train for 5 epochs in mini-batches of 32, with 20% of the samples used for validation.
# NOTE(review): the recorded run below was stopped by a KeyboardInterrupt during
# epoch 1, so the weights read afterwards come from a partially trained model.
model.fit(X, y, batch_size=32, epochs=5, validation_split=0.2)

Train on 928953 samples, validate on 232239 samples
Epoch 1/5
  3072/928953 [..............................] - ETA: 4172s - loss: 9.5836 

KeyboardInterrupt: 

In [86]:
# Word -> vector lookup built from the embedding layer: row i of its first
# weight array is stored under the word whose id is i.
weights = emb.get_weights()[0]
words_vec = {inv_words_dict[i]: weights[i] for i in range(weights.shape[0])}

In [None]:
# Build the word/context count matrix here: the only other assignment of `m`
# in the notebook is commented out, so without this line the cell raises
# NameError on a fresh kernel.
m = sents2wordContextMatrix(sents, words_dict, WINDOW_SIZE)
#y = m.dot(m.T)
# NOTE: this rebinds `y`, which earlier held the training labels for model.fit.
y = m

In [None]:
# Truncated sparse SVD of the matrix, keeping SVD_DIMENSION singular values/vectors.
SVD_DIMENSION=50
u, s, vt = svds(y, k=SVD_DIMENSION)

In [None]:
# Scale the columns of u by the singular values; later cells read row i as the vector of word id i.
words_mat = np.dot(u, np.diag(s))

In [None]:
# Word -> vector lookup built from the SVD result (replaces any earlier words_vec).
words_vec = {inv_words_dict[i]: words_mat[i] for i in range(words_mat.shape[0])}

In [None]:
def plotData(vocabs, X, Y):
    """Draw a labelled scatter plot on a new 36x36 figure.

    Args:
        vocabs: labels, one per point.
        X: x coordinates of the points.
        Y: y coordinates of the points.

    The axes are clipped to the min/max of the data and every point is
    annotated with its label at the point's own position.
    """
    plt.clf()
    plt.figure(figsize=(36, 36))
    plt.scatter(X, Y)

    bounds = [min(X), max(X), min(Y), max(Y)]
    plt.axis(bounds)

    for label, point in zip(vocabs, zip(X, Y)):
        plt.annotate(label, xy=point, xytext=(0, 0), textcoords='offset points')
    plt.show()

In [None]:
def plot(vocabs, words_vec):
    """Scatter-plot the given words using the first two components of their vectors.

    Args:
        vocabs: words to plot; each must be a key of ``words_vec``.
        words_vec: mapping word -> indexable vector with at least two components.
    """
    first_component = []
    second_component = []
    for vocab in vocabs:
        vector = words_vec[vocab]
        first_component.append(vector[0])
        second_component.append(vector[1])
    plotData(vocabs, first_component, second_component)

In [None]:
def plotTSNE(vocabs, vectors):
    """Project the vectors of the given words to 2-D with t-SNE and plot them.

    Args:
        vocabs: words to plot; each must be a key of ``vectors``.
        vectors: mapping word -> vector.

    The t-SNE settings (perplexity 30, 5000 iterations, fixed random_state)
    are hard-coded.
    """
    stacked = np.array([vectors[vocab] for vocab in vocabs])
    projector = TSNE(perplexity=30, n_components=2, n_iter=5000, random_state = 7890)
    projected = projector.fit_transform(stacked)

    plotData(vocabs, projected[:, 0], projected[:, 1])

In [None]:
from DataLoader import GloVe

# Load pre-trained GloVe vectors from a path relative to the working directory
# (the file name suggests the 50-dimensional 6B set).
# NOTE(review): presumably load2 returns a dict-like word -> vector mapping; later
# cells call glove.keys() and index it by word. Confirm against DataLoader.
glove = GloVe.load2('./data/GloVe/glove.6B.50d.txt')

In [None]:
vocabs = ['man', 'woman', 'king', 'queen', 'male', 'female', 'boy', 'girl']
np.random.seed(1234)

# Draw 2000 word ids at random (with replacement, so repeats are possible)
# and keep the words that also have a GloVe vector.
sampled_ids = np.random.randint(0, len(words_dict), 2000)
random_vocabs = [
    inv_words_dict[idx]
    for idx in sampled_ids
    if inv_words_dict[idx] in glove.keys()
]

print(len(random_vocabs))

In [None]:
# t-SNE view of the sampled words using the vectors computed in this notebook.
plotTSNE(random_vocabs, words_vec)

In [None]:
# Same sampled words, this time using the loaded GloVe vectors, for comparison.
plotTSNE(random_vocabs, glove)

In [None]:
def cloestWord(word, words_vec, count = 10):
    """Return the words whose vectors are closest to the vector of ``word``.

    Closeness is the squared Euclidean distance between vectors. The query
    word itself is part of the candidates, so it normally comes first with
    distance 0.

    Args:
        word: query word; must be a key of ``words_vec``.
        words_vec: mapping word -> vector (all vectors of the same length).
        count: maximum number of words to return.

    Returns:
        A list of at most ``count`` words ordered from nearest to farthest.

    Raises:
        KeyError: if ``word`` is not in ``words_vec``.
    """
    # Materialise the key list once instead of once per returned element.
    keys = list(words_vec.keys())
    target = np.asarray(words_vec[word])
    dist = np.array([np.sum(np.square(target - np.asarray(words_vec[key]))) for key in keys])
    # Honour `count` (previously the slice was hard-coded to 10).
    return [keys[i] for i in dist.argsort()[:count]]

In [None]:
# Nearest neighbours of a few probe words, first with the vectors computed in
# this notebook and then with GloVe, printed in the same order as before.
for query in ['man', 'woman', 'however', 'his', 'zero', 'one', 'two']:
    print(cloestWord(query, words_vec))
    print(cloestWord(query, glove))