In [1]:
import numpy as np
import pandas as pd

from collections import Counter

In [2]:
# Toy corpus: a list of documents, each one already split into word tokens.
corpus = ['the quick brown fox jumps over the lazy dog'.split(' ')]

# Tally how often each token occurs across every document.
words = Counter()
for sentence in corpus:
    words.update(sentence)

vocab_size = len(words)
print(f'vocab size: {vocab_size}')

# Assign indices in most_common() order, so the most frequent token gets
# index 0; ix_to_word is the exact inverse mapping.
word_to_ix = {
    token: rank for rank, (token, _count) in enumerate(words.most_common())
}
ix_to_word = {rank: token for token, rank in word_to_ix.items()}

vocab size: 8


In [3]:
def softmax(arr, axis=None):
    """Return the softmax of ``arr``, computed in a numerically stable way.

    The maximum is subtracted before exponentiating so that large scores do
    not overflow ``np.exp``; this shift does not change the result.

    Parameters
    ----------
    arr : array_like
        Unnormalised scores.
    axis : int or None, optional
        Axis along which the probabilities should sum to 1. The default
        ``None`` normalises over the whole array (the original behaviour);
        pass e.g. ``axis=-1`` to normalise each row of a batch independently.

    Returns
    -------
    numpy.ndarray
        Array with the same shape as ``arr`` whose entries along ``axis``
        are non-negative and sum to 1.
    """
    # keepdims=True lets the reduced max/sum broadcast back against ``arr``.
    shifted = arr - np.max(arr, axis=axis, keepdims=True)
    exps = np.exp(shifted)
    return exps / np.sum(exps, axis=axis, keepdims=True)

In [4]:
def data(corpus, window_size):
    """Yield skip-gram training examples, one per token position.

    For every position in every document this yields a pair
    ``(X, y)`` where ``X`` is the vocabulary index of the token at that
    position and ``y`` is the list of vocabulary indices of the tokens up to
    ``window_size`` positions to its left followed by those up to
    ``window_size`` positions to its right (clipped at the document edges).

    Token-to-index lookups go through the module-level ``word_to_ix`` dict.
    """
    for tokens in corpus:
        n_tokens = len(tokens)
        for pos in range(n_tokens):
            # Clip the context window to the document boundaries.
            left_edge = max(0, pos - window_size)
            right_edge = min(n_tokens, pos + window_size + 1)

            center = word_to_ix[tokens[pos]]
            before = [word_to_ix[tok] for tok in tokens[left_edge:pos]]
            after = [word_to_ix[tok] for tok in tokens[pos + 1:right_edge]]

            yield center, before + after

# Peek at the first training example only (nothing is printed if there is none).
first_example = next(data(corpus, 2), None)
if first_example is not None:
    X, y = first_example
    print(X, y)

0 [1, 2]


In [5]:
np.random.seed(42)

# Hyper-parameters.
epochs = 5000
h_size = 5
window_size = 2
log_every = 100  # print the accumulated epoch loss every this many epochs

learning_rate = 1e-2

# W1: one h_size-dimensional row per vocabulary word (the input embeddings).
# W2: projects a hidden vector back to one score per vocabulary word.
W1 = np.random.randn(vocab_size, h_size) * .5
W2 = np.random.randn(h_size, vocab_size) * .5

for epoch in range(epochs):
    loss = 0
    for X, labels in data(corpus, window_size):
        n_context = len(labels)

        # Forward pass: look up the centre word's row as a (1, h_size) matrix,
        # score every vocabulary word, and normalise the scores.
        h = W1[None, X]
        u = np.dot(h, W2)
        y = softmax(u)

        # Summed cross-entropy over the context words:
        #   -sum_c u[label_c] + C * log(sum_j exp(u_j))
        # The log-sum-exp is shifted by max(u) so np.exp cannot overflow,
        # matching the stabilisation already used inside softmax().
        u_max = np.max(u)
        log_norm = u_max + np.log(np.sum(np.exp(u - u_max)))
        loss += -np.sum([u[0, ix] for ix in labels])
        loss += n_context * log_norm

        # Gradient of the loss w.r.t. the scores u: C * y minus one count per
        # context label (the loop keeps repeated labels counted repeatedly).
        dy = y * n_context
        for label in labels:
            dy[0, label] -= 1

        # Both gradients are computed before either weight matrix is updated.
        dW2 = np.dot(h.T, dy)
        dW1 = np.dot(dy, W2.T)

        W2 -= learning_rate * dW2
        # Only the centre word's embedding row receives a gradient.
        W1[X] -= learning_rate * dW1[0]

    if (epoch + 1) % log_every == 0:
        print(loss)

47.41546806410408
43.385156057505796
42.30235768641539
41.83498249708473
41.58653655221668
41.4394613700056
41.34647074863293
41.28453773724881
41.241316299007835
41.209854144264696
41.18609048863718
41.167562615393805
41.1527193211022
41.14054806693887
41.13036623605188
41.1217002133683
41.11421292100189
41.10765898231263
41.101856155173564
41.09666663993935
41.091984550681524
41.08772733274043
41.08382976662949
41.080239703356895
41.076914981496635
41.07382116524366
41.070929862143444
41.06821745625064
41.065664143116095
41.063253186855576
41.060970342546184
41.05880340304191
41.05674184037102
41.054776519711766
41.0528994695523
41.051103695702395
41.049383029795656
41.047732005116856
41.04614575422486
41.04461992407361
41.043150605265645
41.04173427278696
41.04036773611976
41.039048097055336
41.03777271386076
41.036539170713574
41.03534525152386
41.03418891742601
41.03306828735341
41.031981621212395


In [6]:
# Column labels in vocabulary-index order, so column i is the word with index i.
cols = [ix_to_word[ix] for ix in range(vocab_size)]

# One-hot row vector selecting the query word; the index is looked up rather
# than hard-coded so it stays correct if the corpus or vocabulary changes.
lazy = np.zeros((1, vocab_size))
lazy[0, word_to_ix['lazy']] = 1

# Predicted context-word distribution for "lazy": one-hot -> embedding ->
# scores -> softmax. Left as the last expression so the notebook displays it.
pd.DataFrame(softmax(np.dot(np.dot(lazy, W1), W2)), columns=cols)

Unnamed: 0,the,quick,brown,fox,jumps,over,lazy,dog
0,0.33306,4e-06,0.000584,0.000174,1.49631e-07,0.333118,0.000326,0.332734
