In [1]:
import numpy as np
from collections import Counter

In [33]:
corpus = ["the quick brown fox jumps over the lazy dog".split(' ')]
print(f'corpus example: {corpus}')

words = Counter()
for doc in corpus:
    for word in doc:
        words[word] += 1

vocab_size = len(words)
print(f'vocab size: {vocab_size}')

word_to_ix, ix_to_word = {}, {}
for ix, (word, _) in enumerate(words.most_common()):
    word_to_ix[word] = ix
    ix_to_word[ix] = word

corpus_preprocessed = list()
for document in corpus:
    corpus_preprocessed.append([word_to_ix[word] for word in document])
print(f'preprocessed: {corpus_preprocessed}')

corpus example: [['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']]
vocab size: 8
preprocessed: [[0, 1, 2, 3, 4, 5, 0, 6, 7]]


In [169]:
class Leaf:
    
    def __init__(self, value):
        self.path = list()
        self.value = value
    
    def __add__(self, value):
        self.path.append(value)
        return self

class Node:
    
    def __init__(self, v1, v2, h_size):
        self.left  = v1
        self.right = v2
        
        self.left  += 1
        self.right += -1
        
        self.value = np.random.randn(h_size, 1)
    
    def __getitem__(self, index):
        node_len = len(self.leaves[index].path)
        
        nodes = []
        node = self
        for path in self.leaves[index].path:
            nodes.append((path, node))
            if path == 1:
                node = node.left
            else:
                node = node.right
                
        return nodes
    
    def __add__(self, value):
        self.left = self.left + value
        self.right = self.right + value
        return self
    
    @property
    def left(self):
        return self.__left
    
    @left.setter
    def left(self, value):
        self.__left = value
    
    @property
    def right(self):
        return self.__right
    
    @right.setter
    def right(self, value):
        self.__right = value
    
    @property
    def leaves(self):
        return self.__leaves
    
    @leaves.setter
    def leaves(self, leaves):
        self.__leaves = leaves
    
    def __repr__(self):
        return str(self.value)
    
def build_tree(leaves, h_size):
    ref_leaves = {leaf:Leaf(leaf) for leaf in leaves}
    leaves = list(ref_leaves.values())

    nodes = list()

    while True:
        if len(leaves) > 1:
            node = Node(leaves.pop(), leaves.pop(), h_size)
            nodes.append(node)
        elif len(nodes) > 1:
            node = Node(nodes.pop(), nodes.pop(), h_size)
            nodes.insert(0, node)
        else:
            node = Node(nodes.pop(), leaves.pop(), h_size)
            nodes.append(node)

        if len(leaves) == 0 and len(nodes) == 1:
            root = nodes[-1]
            root.leaves = ref_leaves
            return root

np.random.seed(42)

leaves = range(8)
root = build_tree(leaves, 5)

print(root[0])

[(-1, [[-0.60170661]
 [ 1.85227818]
 [-0.01349722]
 [-1.05771093]
 [ 0.82254491]]), (1, [[ 0.11092259]
 [-1.15099358]
 [ 0.37569802]
 [-0.60063869]
 [-0.29169375]]), (1, [[-0.23413696]
 [ 1.57921282]
 [ 0.76743473]
 [-0.46947439]
 [ 0.54256004]])]


In [150]:
def get_data(corpus, window_size=2):
    for document in corpus:
        doc_len = len(document)
        for ix in range(doc_len):
            floor = max(0, ix-window_size)
            ceil = min(doc_len, ix+window_size+1)
            
            X = document[ix]
            
            y = list()
            for word in document[floor:ix]:
                y.append(word)
            for word in document[ix+1:ceil]:
                y.append(word)
            
            yield X, y

In [217]:
class EmbeddingModel(object):
    
    def __init__(self, vocab_size, h_size, learning_rate, seed=None):
        """
        parameters
        ----------
        
        vocabulary: list of unique words
        h_size    : size of the hidden units
        """
        if seed:
            np.random.seed(seed)
        
        self.eta = learning_rate
        
        self.w1   = np.random.randn(vocab_size, h_size)
        self.root = self.build_tree(range(vocab_size), h_size)
    
    def feed_forward(self, X, labels):
        h = self.w1[None, X]
#         y = [self.sigmoid(np.dot(h, self.root[label])) for label in labels]
        
        y_preds = list()
        for label in labels:
            y_pred = list()
            for sign, node in self.root[label]:
                y_pred.append((sign, self.sigmoid(np.dot(h, node.value))))
            y_preds.append(y_pred)

        return h, y_preds
    
    def back_propagation(self, labels, y_preds, h):       
        dW1 = np.zeros_like(self.w1)
        
        for ix, (label, y_pred) in enumerate(zip(labels, y_preds)):
            for (_, y), (sign, node) in zip(y_pred, self.root[label]):
                error = y - 1 if sign == 1 else y

                dW1 += error * node.value.T

                dW2 = error * h.T
                node.value -= self.eta * dW2
        self.w1 -= self.eta * dW1
    
    def build_tree(self, leaves, h_size):
        ref_leaves = {leaf:Leaf(leaf) for leaf in leaves}
        leaves = list(ref_leaves.values())

        nodes = list()

        while True:
            if len(leaves) > 1:
                node = Node(leaves.pop(), leaves.pop(), h_size)
                nodes.append(node)
            elif len(nodes) > 1:
                node = Node(nodes.pop(), nodes.pop(), h_size)
                nodes.insert(0, node)
            else:
                node = Node(nodes.pop(), leaves.pop(), h_size)
                nodes.append(node)

            if len(leaves) == 0 and len(nodes) == 1:
                root = nodes[-1]
                root.leaves = ref_leaves
                return root
    
    def sigmoid(self, x):
        return 1./(1.+np.exp(-x))
    
model = EmbeddingModel(vocab_size, 5, 1e-3, 42)

epochs = 100000

for epoch in range(epochs):
    loss = 0
    for X, labels in get_data(corpus_preprocessed):
        h, y_preds = model.feed_forward(X, labels)

        for y_pred in y_preds:
            for sign, value in y_pred:
                if sign == 1:
                    loss += -np.sum(np.log(value))
                else:
                    loss += -np.sum(np.log(1-value))

        model.back_propagation(labels, y_preds, h)
    if epoch % 1500 == 0:
        print(loss)

94.72712578733046
51.868826718851935
49.72004112570189
48.71601029537539
48.11701027480814
47.682471542617414
47.337062294574814
47.05063036345644
46.807363622093824
46.59748789146731
46.41433740062122
46.253071061581345
46.110020175013844
45.982319975423096
45.86768405797196
45.764257281596194
45.67051493242734
45.585190804268144
45.50722426373679
45.435720301686985
45.369918777235064
45.30917035624118
45.25291744014845
45.20067888670049
45.15203765737928
45.106630753127014
45.06414095849498
45.0242900280083
44.98683303169785
44.95155363869196
44.91826016458189
44.88678224411172
44.85696801845818
44.82868174799408
44.80180177843446
44.77621880172918
44.75183436378726
44.72855957970658
44.70631402409256
44.68502476963991
44.664625551689646
44.64505604017696
44.626261203416874
44.60819075066786
44.59079864246995
44.574042659456424
44.557884021754155
44.54228705226599
44.527218878116855
44.51264916537091
44.49854988282617
44.48489509127833
44.47166075514532
44.458824573764566
44.44636583