<a href="https://colab.research.google.com/github/Kaiziferr/NLP_Workshop/blob/master/embedding/02_workshop_embedding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np

from keras.models import Sequential
from keras.preprocessing.text import one_hot, Tokenizer
from keras.preprocessing.sequence import pad_sequences

from keras.layers import Dense, Embedding

In [2]:
# Toy corpus: six short English sentences used as input for the tokenizer,
# padding and embedding steps below.
docs = [
    "The new one's twice as big as the old one.",
    "I'll make a statement.",
    "The train leaves in ten minutes.",
    "The clock was ticking and kept me awake all night.",
    "He broke his promise, which was a big mistake.",
    "My biggest problem is deciding what I should do next.",
]

# **Tokenizer**
---



In [3]:
# First approach: build the vocabulary with the Keras Tokenizer, fitted on `docs`.
tokenizer_tokenizer = Tokenizer()
tokenizer_tokenizer.fit_on_texts(docs)


# Define the vocabulary size: number of entries in word_index plus one.
# The extra slot accounts for index 0, which word_index never assigns (its
# indices start at 1) and which the padding step uses as the fill value.
vocab_size = len(tokenizer_tokenizer.word_index)+1
vocab_size

43

In [4]:
# word_index: the word -> integer index mapping held by the fitted tokenizer
# (indices start at 1).
print(tokenizer_tokenizer.word_index)

{'the': 1, 'as': 2, 'big': 3, 'a': 4, 'was': 5, 'new': 6, "one's": 7, 'twice': 8, 'old': 9, 'one': 10, "i'll": 11, 'make': 12, 'statement': 13, 'train': 14, 'leaves': 15, 'in': 16, 'ten': 17, 'minutes': 18, 'clock': 19, 'ticking': 20, 'and': 21, 'kept': 22, 'me': 23, 'awake': 24, 'all': 25, 'night': 26, 'he': 27, 'broke': 28, 'his': 29, 'promise': 30, 'which': 31, 'mistake': 32, 'my': 33, 'biggest': 34, 'problem': 35, 'is': 36, 'deciding': 37, 'what': 38, 'i': 39, 'should': 40, 'do': 41, 'next': 42}


# **Pad Sequences**
---



In [5]:
# Encode every document as a list of integer word indices, using the
# vocabulary of the fitted tokenizer. The lists have different lengths,
# one entry per token of the sentence.
encode = tokenizer_tokenizer.texts_to_sequences(docs)
print(encode)

[[1, 6, 7, 8, 2, 3, 2, 1, 9, 10], [11, 12, 4, 13], [1, 14, 15, 16, 17, 18], [1, 19, 5, 20, 21, 22, 23, 24, 25, 26], [27, 28, 29, 30, 31, 5, 4, 3, 32], [33, 34, 35, 36, 37, 38, 39, 40, 41, 42]]


In [6]:
# Define the maximum sequence length. It is derived from the encoded documents
# (the longest one) instead of being hard-coded, so adding a longer sentence to
# the corpus never silently truncates it. For the current corpus this is 10.
max_length = max(len(sequence) for sequence in encode)

# Right-pad shorter sequences with 0 so that every row has `max_length` entries.
# `truncating='post'` is kept for parity with the padding side; with
# `max_length` equal to the longest sequence nothing is actually cut.
padded_docs = pad_sequences(encode, maxlen=max_length, padding='post', truncating='post')
print(padded_docs)

[[ 1  6  7  8  2  3  2  1  9 10]
 [11 12  4 13  0  0  0  0  0  0]
 [ 1 14 15 16 17 18  0  0  0  0]
 [ 1 19  5 20 21 22 23 24 25 26]
 [27 28 29 30 31  5  4  3 32  0]
 [33 34 35 36 37 38 39 40 41 42]]


# **Model**
---



In [7]:
# Minimal model: a single Embedding layer that maps each of the `max_length`
# token indices of a document to a 10-dimensional vector.
model = Sequential([
    Embedding(
        input_dim=vocab_size,
        output_dim=10,
        input_length=max_length,
    ),
])

# Compile the model; the loss and optimizer only come into play if it is trained.
model.compile(optimizer='adam', loss='mse')

In [8]:
# Print the layer stack with output shapes and parameter counts. The embedding
# layer holds vocab_size * output_dim weights (43 * 10 = 430 here).
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 10, 10)            430       
                                                                 
Total params: 430
Trainable params: 430
Non-trainable params: 0
_________________________________________________________________


In [9]:
# Run the padded documents through the embedding layer. No `fit` call precedes
# this cell, so the vectors shown are the layer's initial (untrained) weights:
# one 10-dimensional vector for each of the 10 token positions of every document.
predict = model.predict(padded_docs)
predict

array([[[-1.79401524e-02, -3.48904841e-02, -1.41974539e-03,
         -1.71018355e-02, -9.99679416e-03,  4.71910872e-02,
         -3.90560515e-02, -3.35182920e-02, -4.25909422e-02,
         -4.09699790e-02],
        [ 1.50004290e-02,  1.64554976e-02,  1.60534494e-02,
         -2.21569296e-02,  4.75927442e-03, -5.36174700e-03,
          3.90311368e-02, -2.44204048e-02,  4.67585661e-02,
          3.74079458e-02],
        [-9.21798870e-03,  7.50976801e-03, -1.57356039e-02,
          2.57200040e-02, -4.71609719e-02,  2.22511031e-02,
         -4.59185839e-02, -7.05204904e-04,  3.29685323e-02,
          3.19948457e-02],
        [-3.01219225e-02,  1.04899518e-02, -4.02530916e-02,
         -1.05445608e-02,  2.68332995e-02, -4.01141755e-02,
          1.79869570e-02,  2.48603337e-02, -2.17092279e-02,
         -2.53077745e-02],
        [-8.00902769e-03, -2.72447821e-02, -3.44398506e-02,
          4.30012383e-02, -4.01211977e-02, -3.46414745e-05,
          3.29065360e-02,  4.06569354e-02, -6.180845