In [179]:
import numpy as np
import re
import random

In [180]:
random.seed(30)

In [181]:
def tokenize(text):
    # obtains tokens with a least 1 alphabet
    pattern = re.compile(r'[A-Za-z]+[\w^\']*|[\w^\']*[A-Za-z]+[\w^\']*')
    return pattern.findall(text.lower())

In [182]:
def mapping(tokens):
    word_to_id = dict()
    id_to_word = dict()
    for i,word in enumerate(set(tokens)):
        word_to_id[word] = i
        id_to_word[i] = word
    return word_to_id, id_to_word

In [183]:
doc = "After the deduction of the costs of investing, " \
      "beating the stock market is a loser's game."
tokens = tokenize(doc)
print(tokens)

['after', 'the', 'deduction', 'of', 'the', 'costs', 'of', 'investing', 'beating', 'the', 'stock', 'market', 'is', 'a', "loser's", 'game']


In [184]:
word_to_id, id_to_word = mapping(tokens)
print(set(tokens))
print(id_to_word[0])

{'the', 'market', 'beating', 'deduction', 'after', 'stock', 'a', "loser's", 'is', 'investing', 'game', 'costs', 'of'}
the


In [185]:
#under stand code
# def generate_training_date(tokens, id_to_word, window_size):
#     L = len(tokens)
#     X, Y = [], []
#     for i in range(L):
#         index_before_after = list(range(max(0,i-window_size,i))) + list(range(i+1, min(i+window_size+1,L)))
#         for j in index_before_after:
#             X.append(id_to_word[i])
#             Y.append(id_to_word[j])
#     return X,Y

In [186]:
#X,Y = generate_training_date(tokens, id_to_word, 3)

In [187]:
#print(X[:10])
#print("------------------------------------------------------")
#print(Y[:10])

In [188]:
#real code
def generate_training_date(tokens, word_to_id, window_size):
    L = len(tokens)
    X, Y = [], []
    for i in range(L):
        index_before_after = list(range(max(0, i - window_size), i)) + \
                             list(range(i + 1, min(i + window_size + 1,L)))
        #print(index_before_after)
        for j in index_before_after:
            X.append(word_to_id[tokens[i]])
            Y.append(word_to_id[tokens[j]])
    X = np.array(X)
    X = np.expand_dims(X, axis=0)
    Y = np.array(Y)
    Y = np.expand_dims(Y, axis=0)       
    return X,Y

In [189]:
X,Y = generate_training_date(tokens, word_to_id, 3)
print(X.shape)
print(Y.shape)

(1, 84)
(1, 84)


In [190]:
vocab_size = len(id_to_word)
m = Y.shape[1]
Y_one_hot = np.zeros((vocab_size,m))
Y_one_hot[Y.flatten(), np.arange(m)] = 1
#Y_one_hot[84 cua y flatten, 0 -> 84]
print(Y.shape)
print(Y_one_hot.shape)
print(Y_one_hot[:,1])

(1, 84)
(13, 84)
[0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


In [206]:
def initialize_wrd_ebd(vocab_size, embed_size):
    random.seed(30)
    return np.random.randn(vocab_size, embed_size) * 0.01

def initialize_dense(input_size, output_size):
    random.seed(30)
    return np.random.randn(input_size,output_size) * 0.01

def initialize_parameters(vocab_size, embed_size):
    embedded_layer = initialize_wrd_ebd(vocab_size, embed_size)
    dense_layer = initialize_dense(vocab_size, embed_size)
    
    parameters = {}
    parameters['emb'] = embedded_layer
    parameters['W'] = dense_layer
    return parameters
    

In [207]:
def input_to_word_vecs(X, parameters):
    m = X.shape[1]
    embeded_matrix = parameters['emb']
    # X shape (1, 84)
    # embeded_matrix (13, 50)
    word_vectors = embeded_matrix[X.flatten(),:].T
    return word_vectors
    #print(word_vectors.shape) 50 84

In [208]:
#parameters = initialize_parameter(vocab_size, 50)
#word_vectors = input_to_word_vecs(X,parameters)

In [209]:
def linear_dense(word_vectors, parameters):
    W = parameters['W']
    #print(W.shape) 13 50
    Z = np.dot(W, word_vectors)
    #print(Z.shape) 13 84
    return W, Z

In [210]:
#linear_dense(word_vectors, parameters)

In [211]:
def softmax(Z):
    """
    Z: output out of the dense layer. shape: (vocab_size, m)
    """
    softmax_out = np.divide(np.exp(Z), np.sum(np.exp(Z), axis=0, keepdims=True) + 0.001)
    assert(softmax_out.shape == Z.shape)

    return softmax_out

In [212]:
def foward_prop(X, parameters):
    word_vectors = input_to_word_vecs(X, parameters)
    W, Z = linear_dense(word_vectors, parameters)
    softmax_out = softmax(Z)
    
    caches = {}
    caches['X'] = X
    caches['word_vec'] = word_vectors
    caches['W'] = W
    caches['Z'] = Z
    return softmax_out, caches

In [221]:
paramaters = initialize_parameters(vocab_size, 50)
softmax_out, caches = foward_prop(X, paramaters)