In [1]:
import numpy as np

from h_softmax import Tree
from collections import Counter

In [2]:
# Toy corpus: a list of documents, each document a list of word tokens.
corpus = ["the quick brown fox jumps over the lazy dog".split(' ')]
print(f'corpus example: {corpus}')

# Word frequencies across every document.
words = Counter(token for tokens in corpus for token in tokens)

vocab_size = len(words)
print(f'vocab size: {vocab_size}')

# Ids follow descending frequency (most_common keeps first-seen order on
# ties), so id 0 is the most frequent word.
ix_to_word = dict(enumerate(token for token, _count in words.most_common()))
word_to_ix = {token: token_id for token_id, token in ix_to_word.items()}

# Same corpus with every word replaced by its integer id.
corpus_preprocessed = [[word_to_ix[token] for token in tokens] for tokens in corpus]
print(f'preprocessed: {corpus_preprocessed}')

corpus example: [['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']]
vocab size: 8
preprocessed: [[0, 1, 2, 3, 4, 5, 0, 6, 7]]


In [3]:
def get_data(corpus, window_size=2):
    """Generate (centre, context) training pairs from a corpus.

    parameters
    ----------

    corpus      : iterable of documents; each document must support len(),
                  integer indexing and slicing
    window_size : number of neighbours taken on each side of the centre

    yields
    ------

    (X, y) for every position of every document, where X is the element at
    that position and y is a list holding the elements up to ``window_size``
    positions to its left followed by those up to ``window_size`` positions
    to its right (the window is clipped at the document boundaries).
    """
    for tokens in corpus:
        n_tokens = len(tokens)
        for pos in range(n_tokens):
            # Clip the window so it never reaches outside the document.
            left = tokens[max(pos - window_size, 0):pos]
            right = tokens[pos + 1:min(pos + window_size + 1, n_tokens)]

            yield tokens[pos], [*left, *right]

In [None]:
class EmbeddingModel(object):
    """Embedding matrix ``w1`` trained against a ``Tree`` output layer.

    NOTE(review): ``Tree`` comes from the project-local ``h_softmax`` module.
    From the way it is used here, ``tree[label]`` is expected to return the
    weight columns of the nodes on ``label``'s path, ``tree.get_path(label)``
    the per-node targets (compared against 1) and ``tree.get_indexes(label)``
    the positions of those columns inside ``tree.weights`` -- confirm against
    ``h_softmax``.
    """

    def __init__(self, vocab_size, h_size, learning_rate, seed=None):
        """
        parameters
        ----------

        vocab_size    : number of distinct words (rows of the embedding matrix)
        h_size        : size of the hidden units
        learning_rate : step size applied to every gradient update
        seed          : optional seed for numpy's global RNG; ``None`` leaves
                        the RNG state untouched
        """
        # Compare against None (not truthiness) so that seed=0 is honoured.
        if seed is not None:
            np.random.seed(seed)

        self.eta = learning_rate

        self.w1   = np.random.randn(vocab_size, h_size)
        self.tree = Tree(range(vocab_size), h_size)

        # Scratch buffer for the tree-weight gradient, reused on every step.
        self._dw2 = np.zeros_like(self.tree.weights)

    def feed_forward(self, X, labels):
        """
        parameters
        ----------

        X      : index of the input word (row of ``w1``)
        labels : iterable of target word ids

        returns
        -------

        h : the embedding of ``X`` as a column, shape (h_size, 1) for an
            integer ``X``
        y : list with one entry per label holding the sigmoid of
            ``tree[label].T @ h``
        """
        h = self.w1[None, X].T

        u_s = [np.dot(self.tree[label].T, h) for label in labels]
        y = [self.sigmoid(u) for u in u_s]

        return h, y

    def back_propagation(self, X, labels, y_preds, h):
        """
        Apply one gradient-descent step to ``w1[X]`` and to the tree weights.

        parameters
        ----------

        X       : index of the input word used in ``feed_forward``
        labels  : target word ids, in the same order as ``y_preds``
        y_preds : predictions returned by ``feed_forward``
        h       : hidden activation returned by ``feed_forward``
        """
        dw1 = np.zeros((1, self.w1.shape[1]))
        # Reset the accumulator left over from the previous step.
        self._dw2.fill(0.)

        for label, y_pred in zip(labels, y_preds):
            # Prediction minus target: the target is 1 where the path entry
            # equals 1 and 0 everywhere else.
            error = [p-sign if sign == 1 else p for p, sign in zip(y_pred, self.tree.get_path(label))]
            error = np.array(error).T

            # Example shapes noted by the original author (TODO confirm):
            # error   -> 3x1
            # weights -> 5x3
            dw1 += np.dot(error, self.tree[label].T)

            dw2 = h * error
            dw2 = dw2.T

            # Scatter each path node's gradient into its column of the
            # shared gradient buffer.
            for ix, w2_ix in enumerate(self.tree.get_indexes(label)):
                self._dw2[:,w2_ix] += dw2[ix]

        self.w1[None, X] -= self.eta * dw1

        self.tree.weights -= self.eta * self._dw2

    def sigmoid(self, x):
        """Logistic function 1 / (1 + exp(-x)).

        Written as exp(-log(1 + exp(-x))) through ``np.logaddexp`` so that
        large negative inputs do not overflow inside ``np.exp``.
        """
        return np.exp(-np.logaddexp(0., -x))

    def predict(self, X):
        """Unfinished stub: looks up the embedding of ``X`` and returns None.

        NOTE(review): the scoring step was never written; nothing in this
        notebook calls this method yet.
        """
        h = self.w1[None, X].T
        # TODO: score `h` against the tree and return the result.
# Build the model in this cell so that it runs on a fresh kernel: with the
# constructor call commented out, `model` is undefined after a restart.
# Arguments: vocab_size, h_size=5, learning_rate=1e-3, seed=42.
model = EmbeddingModel(vocab_size, 5, 1e-3, 42)

epochs = 150000
for epoch in range(epochs):
    # Negative log-likelihood accumulated over every training pair this epoch.
    loss = 0
    for X, labels in get_data(corpus_preprocessed):
        h, y_preds = model.feed_forward(X, labels)

        for label, label_preds in zip(labels, y_preds):
            for sign, y_pred in zip(model.tree.get_path(label), label_preds):
                # Path entries equal to 1 are scored with log(p), every
                # other entry with log(1 - p).
                if sign == 1:
                    loss += -np.sum(np.log(y_pred))
                else:
                    loss += -np.sum(np.log(1-y_pred))

        model.back_propagation(X, labels, y_preds, h)

    # Report the epoch loss every 1500 epochs.
    if epoch % 1500 == 0:
        print(loss)

In [22]:
# Vocabulary words in id order (dict insertion order).
[*word_to_ix]

['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'lazy', 'dog']

In [30]:
def sigmoid(x):
    """Logistic function 1 / (1 + exp(-x)) for a scalar or numpy array.

    Evaluated as exp(-log(1 + exp(-x))) through ``np.logaddexp`` so that
    large negative inputs do not overflow inside ``np.exp``; they come out
    as 0.0 without a RuntimeWarning.
    """
    return np.exp(-np.logaddexp(0., -x))

In [77]:
# For input word id 1 ('quick' in the vocabulary above), compute one score
# per vocabulary word: the product of the sigmoids along that word's tree
# path. NOTE(review): multiplying the path weights by `path` presumably
# flips the sign for "other branch" nodes, which would make this the
# hierarchical-softmax probability -- confirm against h_softmax.
result = np.zeros(vocab_size)

# Sized by vocab_size rather than a literal so the vector always lines up
# with the vocabulary.
for i in range(vocab_size):
    path = model.tree.get_path(i)

    result[i] = np.prod(sigmoid(model.w1[1] @ (model.tree[i]*path)))

result

array([3.33210946e-01, 1.73457018e-05, 3.33243523e-01, 3.33270506e-01,
       1.35890963e-10, 6.22835252e-09, 7.50563885e-05, 1.82615977e-04])

In [78]:
import pandas as pd

In [79]:
# Natural log of each score, shown as a one-row table labelled by word.
pd.DataFrame(np.log(result)[np.newaxis, :], columns=[*word_to_ix])

Unnamed: 0,the,quick,brown,fox,jumps,over,lazy,dog
0,-1.09898,-10.962166,-1.098882,-1.098801,-22.719168,-18.894154,-9.497271,-8.608125


In [73]:
# Bytes needed for 189378 x 300 values at 4 bytes each (presumably a float32
# embedding matrix), then shown in MiB.
b = 189378 * 300 * 4
b / (1 << 20)

216.72592163085938

In [76]:
# Bytes needed for 435000 x 300 values at 4 bytes each (presumably a float32
# embedding matrix), then shown in MiB.
b = 435000 * 300 * 4
b / (1 << 20)

497.8179931640625