In [49]:
import numpy as np
import re

In [50]:
def tokenize(text):
    # obtains tokens with a least 1 alphabet
    pattern = re.compile(r'[A-Za-z]+[\w^\']*|[\w^\']*[A-Za-z]+[\w^\']*')
    return pattern.findall(text.lower())

In [51]:
def mapping(tokens):
    word_to_id = dict()
    id_to_word = dict()
    for i,word in enumerate(set(tokens)):
        word_to_id[word] = i
        id_to_word[i] = word
    return word_to_id, id_to_word

In [52]:
doc = "After the deduction of the costs of investing, " \
      "beating the stock market is a loser's game."
tokens = tokenize(doc)
print(tokens)

['after', 'the', 'deduction', 'of', 'the', 'costs', 'of', 'investing', 'beating', 'the', 'stock', 'market', 'is', 'a', "loser's", 'game']


In [53]:
word_to_id, id_to_word = mapping(tokens)
print(set(tokens))
print(id_to_word[0])

{'the', 'market', 'beating', 'deduction', 'after', 'stock', 'a', "loser's", 'is', 'investing', 'game', 'costs', 'of'}
the


In [54]:
#under stand code
# def generate_training_date(tokens, id_to_word, window_size):
#     L = len(tokens)
#     X, Y = [], []
#     for i in range(L):
#         index_before_after = list(range(max(0,i-window_size,i))) + list(range(i+1, min(i+window_size+1,L)))
#         for j in index_before_after:
#             X.append(id_to_word[i])
#             Y.append(id_to_word[j])
#     return X,Y

In [55]:
X,Y = generate_training_date(tokens, id_to_word, 3)

KeyError: 'after'

In [56]:
print(X[:10])
print("------------------------------------------------------")
print(Y[:10])

[[ 0  0  0  9  9  9  9  2  2  2  2  2  6  6  6  6  6  6  9  9  9  9  9  9
   5  5  5  5  5  5  6  6  6  6  6  6  7  7  7  7  7  7  8  8  8  8  8  8
   9  9  9  9  9  9 10 10 10 10 10 10 11 11 11 11 11 11 12 12 12 12 12 12
  13 13 13 13 13 14 14 14 14 15 15 15]]
------------------------------------------------------
[[ 9  2  6  0  2  6  9  0  9  6  9  5  0  9  2  9  5  6  9  2  6  5  6  7
   2  6  9  6  7  8  6  9  5  7  8  9  9  5  6  8  9 10  5  6  7  9 10 11
   6  7  8 10 11 12  7  8  9 11 12 13  8  9 10 12 13 14  9 10 11 13 14 15
  10 11 12 14 15 11 12 13 15 12 13 14]]


In [57]:
#real code
def generate_training_date(tokens, word_to_id, window_size):
    L = len(tokens)
    X, Y = [], []
    for i in range(L):
        index_before_after = list(range(max(0, i - window_size), i)) + \
                             list(range(i + 1, min(i + window_size + 1,L)))
        #print(index_before_after)
        for j in index_before_after:
            X.append(word_to_id[tokens[i]])
            Y.append(word_to_id[tokens[j]])
    X = np.array(X)
    X = np.expand_dims(X, axis=0)
    Y = np.array(Y)
    Y = np.expand_dims(Y, axis=0)       
    return X,Y

In [58]:
X,Y = generate_training_date(tokens, word_to_id, 3)
print(X.shape)
print(Y.shape)

(1, 84)
(1, 84)


In [79]:
vocab_size = len(id_to_word)
m = Y.shape[1]
Y_one_hot = np.zeros((vocab_size,m))
Y_one_hot[Y.flatten(), np.arange(m)] = 1
#Y_one_hot[84 cua y flatten, 0 -> 84]
print(Y.shape)
print(Y_one_hot.shape)
print(Y_one_hot[:,1])

(1, 84)
(13, 84)
[0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
