In [3]:
import numpy as np
from numpy.linalg import norm
import random
from utils.gradcheck import gradcheck_naive
from utils.utils import normalizeRows, softmax

In [None]:
random.seed(314)
dataset = StanfordSentiment()
tokens = dataset.tokens()
nWords = len(tokens)

In [3]:
tokens['computer']

8462

In [4]:
dimVectors = 10

# Context size
C = 5

In [5]:
random.seed(31415)
np.random.seed(9265)

In [8]:
dataset = type('dummy', (), {})()

In [11]:
    def dummySampleTokenIdx():
        return random.randint(0, 4)

    def getRandomContext(C):
        tokens = ["a", "b", "c", "d", "e"]
        return tokens[random.randint(0,4)], \
            [tokens[random.randint(0,4)] for i in range(2*C)]
    dataset.sampleTokenIdx = dummySampleTokenIdx
    dataset.getRandomContext = getRandomContext

In [12]:
dataset

<__main__.dummy at 0x1172eaef0>

In [13]:
    dataset.sampleTokenIdx = dummySampleTokenIdx
    dataset.getRandomContext = getRandomContext

In [14]:
    random.seed(31415)
    np.random.seed(9265)
    dummy_vectors = normalizeRows(np.random.randn(10,3))
    dummy_tokens = dict([("a",0), ("b",1), ("c",2),("d",3),("e",4)])

In [16]:
dummy_vectors

array([[-0.96735714, -0.02182641,  0.25247529],
       [ 0.73663029, -0.48088687, -0.47552459],
       [-0.27323645,  0.12538062,  0.95374082],
       [-0.56713774, -0.27178229, -0.77748902],
       [-0.59609459,  0.7795666 ,  0.19221644],
       [-0.6831809 , -0.04200519,  0.72904007],
       [ 0.18289107,  0.76098587, -0.62245591],
       [-0.61517874,  0.5147624 , -0.59713884],
       [-0.33867074, -0.80966534, -0.47931635],
       [-0.52629529, -0.78190408,  0.33412466]])

In [15]:
dummy_tokens

{'a': 0, 'b': 1, 'c': 2, 'd': 3, 'e': 4}

In [17]:
batchsize = 50
loss = 0.0
grad = np.zeros(dummy_vectors.shape)
N = dummy_vectors.shape[0]

In [18]:
centerWordVectors = dummy_vectors[:int(N/2),:]
outsideVectors = dummy_vectors[int(N/2):,:]

In [20]:
centerWordVectors

array([[-0.96735714, -0.02182641,  0.25247529],
       [ 0.73663029, -0.48088687, -0.47552459],
       [-0.27323645,  0.12538062,  0.95374082],
       [-0.56713774, -0.27178229, -0.77748902],
       [-0.59609459,  0.7795666 ,  0.19221644]])

In [21]:
outsideVectors

array([[-0.6831809 , -0.04200519,  0.72904007],
       [ 0.18289107,  0.76098587, -0.62245591],
       [-0.61517874,  0.5147624 , -0.59713884],
       [-0.33867074, -0.80966534, -0.47931635],
       [-0.52629529, -0.78190408,  0.33412466]])

In [21]:
windowSize = 5

In [24]:
windowSize1 = random.randint(1, windowSize)

In [25]:
windowSize1

3

In [26]:
centerWord, context = dataset.getRandomContext(windowSize1)

In [27]:
centerWord

'd'

In [28]:
context

['a', 'e', 'e', 'd', 'b', 'c']

In [None]:
c, gin, gout = skipgram(
            centerWord, windowSize1, context, word2Ind, centerWordVectors,
            outsideVectors, dataset, word2vecLossAndGradient
        )

In [None]:
def naiveSoftmaxLossAndGradient(
    centerWordVec,
    outsideWordIdx,
    outsideVectors,
    dataset
):
    value = np.dot(outsideVectors, centerWordVec)

In [38]:
#skip gram as word2vecModel:
word2Ind = dummy_tokens
outsideWords = context
currentCenterWord = centerWord
#####################
loss = 0.0
gradCenterVecs = np.zeros(centerWordVectors.shape)
gradOutsideVectors = np.zeros(outsideVectors.shape)
outsideWordIndices = [word2Ind[i] for i in outsideWords]
centerWordIdx = word2Ind[currentCenterWord]
centerWordVec = centerWordVectors[centerWordIdx] 

In [46]:
#one iteration of naivesoftmaxlossandgradient
outsideWordIdx = outsideWordIndices[0]
value = np.dot(outsideVectors, centerWordVec)

In [47]:
value

array([-0.16794671,  0.17340572,  0.67325666,  0.78478886,  0.25121135])

In [48]:
y_hat  = softmax(value)

In [49]:
y_hat

array([0.11312885, 0.1591552 , 0.26236345, 0.29331965, 0.17203285])

In [50]:
loss = - np.log(y_hat[outsideWordIdx])

In [51]:
loss

2.1792278778060012

In [53]:
d_value = y_hat
d_value[outsideWordIdx] -= 1

In [54]:
d_value

array([-0.88687115,  0.1591552 ,  0.26236345,  0.29331965,  0.17203285])

In [55]:
gradCenterVec = outsideVectors.T.dot(d_value)

In [56]:
gradOutsideVecs = d_value[:, np.newaxis].dot( np.array([centerWordVec]) )

In [57]:
gradOutsideVecs

array([[ 0.5029781 ,  0.24103588,  0.68953258],
       [-0.09026292, -0.04325557, -0.12374142],
       [-0.14879621, -0.07130574, -0.2039847 ],
       [-0.16635264, -0.07971909, -0.22805281],
       [-0.09756632, -0.04675548, -0.13375365]])

In [36]:
# for outsideWordIdx in outsideWordIndices:

In [39]:
centerWordIdx

3

In [40]:
centerWordVec

array([-0.56713774, -0.27178229, -0.77748902])

In [35]:
outsideWordIndices

[0, 4, 4, 3, 1, 2]

In [31]:
gradCenterVecs

array([[0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.]])

In [33]:
gradOutsideVectors

array([[0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.]])

In [87]:
word2Ind = dummy_tokens
currentCenterWord = centerWord
outsideWords = context

In [88]:
centerWordInd = word2Ind[currentCenterWord]

In [89]:
centerWordInd

1

In [90]:
centerWordVec = centerWordVectors[centerWordInd]

In [91]:
centerWordVec

array([ 0.73663029, -0.48088687, -0.47552459])

In [92]:
outsideWordIndices = [word2Ind[i] for i in outsideWords]

In [93]:
outsideWordIndices

[3, 3, 3, 0, 1, 4, 0, 2]

In [94]:
outsideWordIdx = outsideWordIndices[0]

In [95]:
outsideWordIdx

3

In [108]:
outsideVectors

array([[-0.6831809 , -0.04200519,  0.72904007],
       [ 0.18289107,  0.76098587, -0.62245591],
       [-0.61517874,  0.5147624 , -0.59713884],
       [-0.33867074, -0.80966534, -0.47931635],
       [-0.52629529, -0.78190408,  0.33412466]])

In [109]:
centerWordVec

array([ 0.73663029, -0.48088687, -0.47552459])

In [96]:
value = np.dot(outsideVectors, centerWordVec)

In [97]:
value

array([-0.82972848,  0.06476808, -0.41674757,  0.36780902, -0.17056214])

In [98]:
from utils.utils import normalizeRows, softmax

In [99]:
y_hat = softmax(value)

In [111]:
y_hat

array([ 0.0980147 ,  0.23975415,  0.1481312 , -0.6753801 ,  0.18948006])

In [101]:
loss = -np.log(y_hat[outsideWordIdx])

In [102]:
loss

1.125100332007605

In [103]:
d_value = y_hat

In [104]:
d_value[outsideWordIdx] -= 1

In [105]:
d_value

array([ 0.0980147 ,  0.23975415,  0.1481312 , -0.6753801 ,  0.18948006])

In [106]:
gradCenterVec = outsideVectors.T.dot(d_value)

In [107]:
gradCenterVec

array([0.01476897, 0.6532614 , 0.22079605])

In [112]:
gradOutsideVecs = d_value[:, np.newaxis].dot( np.array([centerWordVec]) )

In [113]:
gradOutsideVecs

array([[ 0.0722006 , -0.04713398, -0.0466084 ],
       [ 0.17661017, -0.11529462, -0.11400899],
       [ 0.10911793, -0.07123435, -0.07044003],
       [-0.49750544,  0.32478142,  0.32115985],
       [ 0.13957675, -0.09111847, -0.09010243]])

In [None]:
loss

In [None]:
        one_loss, one_gradCenter, one_gradOutside = \
            word2vecLossAndGradient(centerWordVec, outsideWordIdx, outsideVectors, dataset) 

In [None]:
word2

In [None]:
word2vec_sgd_wrapper(
        skipgram, dummy_tokens, vec, dataset, 5, naiveSoftmaxLossAndGradient),
        dummy_vectors, "naiveSoftmaxLossAndGradient Gradient")

In [None]:
for i in range(batchsize):
        windowSize1 = random.randint(1, windowSize)
        centerWord, context = dataset.getRandomContext(windowSize1)

        c, gin, gout = word2vecModel(
            centerWord, windowSize1, context, word2Ind, centerWordVectors,
            outsideVectors, dataset, word2vecLossAndGradient
        )
        loss += c / batchsize
        grad[:int(N/2), :] += gin / batchsize
        grad[int(N/2):, :] += gout / batchsize

    return loss, grad

In [None]:
gradcheck_naive(lambda vec: word2vec_sgd_wrapper(
        skipgram, dummy_tokens, vec, dataset, 5, naiveSoftmaxLossAndGradient),
        dummy_vectors, "naiveSoftmaxLossAndGradient Gradient")

In [None]:
c, gin, gout = word2vecModel(
            centerWord, windowSize1, context, word2Ind, centerWordVectors,
            outsideVectors, dataset, word2vecLossAndGradient
        )