## Word2vec from Scratch
Reinventing a wheel is usually an awesome way to learn something deeply

In [168]:
import re
import numpy as np

def tokenize(text):
    # obtains tokens with a least 1 alphabet
    pattern = re.compile(r'[A-Za-z]+[\w^\']*|[\w^\']*[A-Za-z]+[\w^\']*')
    return pattern.findall(text.lower())

def mapping(tokens):
    word_to_id = dict()
    id_to_word = dict()

    for i, token in enumerate(set(tokens)):
        word_to_id[token] = i
        id_to_word[i] = token

    return word_to_id, id_to_word

def generate_training_data(tokens, word_to_id, window_size):
    N = len(tokens)  #N=16
    X, Y = [], []
    t = 0
    for i in range(N): #i=0-15
        nbr_inds = list(range(max(0, i - window_size), i)) + \
                   list(range(i + 1, min(N, i + window_size + 1)))
#        print('i=',i,'t=',t)
        for j in nbr_inds:
            X.append(word_to_id[tokens[i]])  #the central words
            Y.append(word_to_id[tokens[j]])  #the outside words (in windows)
            t = t + 1
            
    X = np.array(X)
    X = np.expand_dims(X, axis=0)
    Y = np.array(Y)
    Y = np.expand_dims(Y, axis=0)
#    print(X.shape,Y.shape, 't=', t)

    return X, Y

In [169]:
doc = "After the deduction of the costs of investing, " \
      "beating the stock market is a loser's game."
#doc = "After the deduction of the costs" 

tokens = tokenize(doc)
print(tokens)
print()

#import nltk
#tokens = nltk.word_tokenize(doc)
#print(tokens)

print([i for i in range(0)])

['after', 'the', 'deduction', 'of', 'the', 'costs', 'of', 'investing', 'beating', 'the', 'stock', 'market', 'is', 'a', "loser's", 'game']

[]


In [170]:
word_to_id, id_to_word = mapping(tokens)
print(word_to_id)
print()
print(id_to_word)

{"loser's": 0, 'stock': 1, 'market': 3, 'beating': 2, 'costs': 4, 'of': 5, 'after': 6, 'the': 7, 'deduction': 8, 'a': 9, 'investing': 10, 'is': 11, 'game': 12}

{0: "loser's", 1: 'stock', 2: 'beating', 3: 'market', 4: 'costs', 5: 'of', 6: 'after', 7: 'the', 8: 'deduction', 9: 'a', 10: 'investing', 11: 'is', 12: 'game'}


In [180]:
window_size = 1
X, Y = generate_training_data(tokens, word_to_id, window_size)
print(X.shape,Y.shape)
vocab_size = len(id_to_word)
m = Y.shape[1]
# turn Y into one hot encoding
Y_one_hot = np.zeros((vocab_size, m))
Y_one_hot[Y.flatten(), np.arange(m)] = 1

print(Y.flatten())

print(Y_one_hot.astype(int)) 

(1, 30) (1, 30)
[ 7  6  8  7  5  8  7  5  4  7  5  4 10  5  2 10  7  2  1  7  3  1 11  3
  9 11  0  9 12  0]
[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 1 0 0 1 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 0 0 1 0 0 1 0 0 1 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0]
 [0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0]]


In [181]:
a = np.zeros((3,3))
b=[2,2,2]
b=np.array(b)
print(b.flatten())
print(np.arange(3))
a[b.flatten(),np.arange(3)]=1
a

[2 2 2]
[0 1 2]


array([[0., 0., 0.],
       [0., 0., 0.],
       [1., 1., 1.]])

In [173]:
i=0
N = len(tokens)
print('N =',N)
nbr_inds = list(range(max(0, i - 3), i)) + list(range(i + 1, min(N, i + 3 + 1)))
print(list(range(max(0, i - 3), i)))
print(list(range(i + 1, min(N, i + 3 + 1))))
print(nbr_inds)

N = 16
[]
[1, 2, 3]
[1, 2, 3]


In [182]:
print(X.shape,X)
print()
print(Y.shape,Y)
print(Y_one_hot.shape)

(1, 30) [[ 6  7  7  8  8  5  5  7  7  4  4  5  5 10 10  2  2  7  7  1  1  3  3 11
  11  9  9  0  0 12]]

(1, 30) [[ 7  6  8  7  5  8  7  5  4  7  5  4 10  5  2 10  7  2  1  7  3  1 11  3
   9 11  0  9 12  0]]
(13, 30)


In [175]:
def initialize_wrd_emb(vocab_size, emb_size):
    """
    vocab_size: int. vocabulary size of your corpus or training data
    emb_size: int. word embedding size. How many dimensions to represent each vocabulary
    """
    WRD_EMB = np.random.randn(vocab_size, emb_size) * 0.01
    return WRD_EMB

def initialize_dense(input_size, output_size):
    """
    input_size: int. size of the input to the dense layer
    output_szie: int. size of the output out of the dense layer
    """
    W = np.random.randn(output_size, input_size) * 0.01
    return W

def initialize_parameters(vocab_size, emb_size):
    """
    initialize all the trianing parameters
    """
    WRD_EMB = initialize_wrd_emb(vocab_size, emb_size)
    W = initialize_dense(emb_size, vocab_size)
    
    parameters = {}
    parameters['WRD_EMB'] = WRD_EMB
    parameters['W'] = W
    
    return parameters

In [185]:
print(initialize_wrd_emb(13,3))
print(initialize_dense(3,13))

[[-0.00273695 -0.00494449  0.00817263]
 [-0.00205551 -0.0055717  -0.00473694]
 [-0.01143236  0.00482378  0.00092643]
 [ 0.00542296 -0.01913017  0.00971176]
 [ 0.01895795  0.00517051 -0.01105913]
 [ 0.0072756  -0.0049198  -0.01168428]
 [-0.00346664 -0.00532906  0.01203248]
 [-0.00046539  0.0194206   0.00227028]
 [-0.00027444  0.01185689 -0.00737148]
 [ 0.00459522 -0.0020205  -0.01155637]
 [ 0.00834348  0.00158722 -0.00495168]
 [ 0.00573648  0.00836717  0.01135202]
 [ 0.00183763 -0.02149874  0.00974983]]
[[ 2.43272299e-03 -1.35478807e-03  9.20326530e-03]
 [-1.46032421e-02  1.72006540e-02  8.02709283e-03]
 [ 2.22381606e-02 -4.21840274e-03  1.55464387e-03]
 [-6.69747123e-03  2.20050292e-02  2.75063957e-03]
 [ 6.56206921e-03  8.68914402e-03 -4.57766065e-03]
 [-3.88922049e-03 -2.73560117e-03 -7.32916089e-03]
 [ 4.97082619e-04  1.26166799e-02  1.19434427e-02]
 [ 2.17634713e-03  9.87780373e-04  1.14959767e-02]
 [ 1.03126429e-02 -3.36350746e-03  2.79422485e-03]
 [-3.24321882e-03 -9.47475532e-03

In [177]:
def ind_to_word_vecs(inds, parameters):
    """
    inds: numpy array. shape: (1, m)
    parameters: dict. weights to be trained
    """
    m = inds.shape[1]
    WRD_EMB = parameters['WRD_EMB']
    word_vec = WRD_EMB[inds.flatten(), :].T
    
    assert(word_vec.shape == (WRD_EMB.shape[1], m))
    
    return word_vec

def linear_dense(word_vec, parameters):
    """
    word_vec: numpy array. shape: (emb_size, m)
    parameters: dict. weights to be trained
    """
    m = word_vec.shape[1]
    W = parameters['W']
    Z = np.dot(W, word_vec)
    
    assert(Z.shape == (W.shape[0], m))
    
    return W, Z

def softmax(Z):
    """
    Z: output out of the dense layer. shape: (vocab_size, m)
    """
    softmax_out = np.divide(np.exp(Z), np.sum(np.exp(Z), axis=0, keepdims=True) + 0.001)
    
    assert(softmax_out.shape == Z.shape)

    return softmax_out

def forward_propagation(inds, parameters):
    word_vec = ind_to_word_vecs(inds, parameters)
    W, Z = linear_dense(word_vec, parameters)
    softmax_out = softmax(Z)
    
    caches = {}
    caches['inds'] = inds
    caches['word_vec'] = word_vec
    caches['W'] = W
    caches['Z'] = Z
    
    return softmax_out, caches