<a href="https://colab.research.google.com/github/Kaiziferr/NLP_Workshop/blob/master/word_mbeddings/01_workshop_basic_embedding_keras_with_a_single_number.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from keras.models import Sequential
from keras.layers import Dense, Flatten, Embedding

from keras.preprocessing.text import Tokenizer, one_hot
from keras.preprocessing.sequence import pad_sequences

import numpy as np

In [2]:
# Small corpus of ten short feedback phrases.
docs = [
    'Well done!',
    'Good work',
    'Great effort',
    'nice work',
    'Excellent!',
    'Weak',
    'Poor effort!',
    'not good',
    'poor work',
    'Could have done better.',
]
# One binary target per phrase: 1 for the first five, 0 for the last five.
labels = np.array([1] * 5 + [0] * 5)
# Size of the integer index space used for the encoded words.
vocab_size = 50

In [3]:
# one_hot encodes each document as a list of integer word indices bounded by
# vocab_size (one list per document, one index per word).
# NOTE(review): the indices come from hashing, so two different words can end
# up with the same index — confirm this is acceptable for the experiment.
encoded_docs = [
    one_hot(
        doc,
        vocab_size,
        filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
        lower=True,
        split=' ',
    )
    for doc in docs
]
encoded_docs

[[4, 3],
 [19, 32],
 [14, 35],
 [21, 32],
 [25],
 [8],
 [3, 35],
 [44, 19],
 [3, 32],
 [6, 31, 3, 17]]

In [4]:
# pad_sequences turns a list of sequences into a 2D NumPy array whose rows all
# have the same length: shorter sequences are filled with `value`, longer ones
# are truncated.
sequence = [[1], [2, 3], [4, 5, 6], [7, 8, 9, 10]]
pad_sequences(
    sequence,
    maxlen=7,
    dtype='float64',
    padding='post',
    truncating='pre',
    value=0,
)

array([[ 1.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 2.,  3.,  0.,  0.,  0.,  0.,  0.],
       [ 4.,  5.,  6.,  0.,  0.,  0.,  0.],
       [ 7.,  8.,  9., 10.,  0.,  0.,  0.]])

In [5]:
# Bring every encoded document to the same length so they stack into one
# integer matrix; zeros are appended after the real indices.
max_length = 4
paddec_doc = pad_sequences(
    encoded_docs,
    value=0,
    padding='post',
    truncating='pre',
    dtype='int64',
    maxlen=max_length,
)
paddec_doc

array([[ 4,  3,  0,  0],
       [19, 32,  0,  0],
       [14, 35,  0,  0],
       [21, 32,  0,  0],
       [25,  0,  0,  0],
       [ 8,  0,  0,  0],
       [ 3, 35,  0,  0],
       [44, 19,  0,  0],
       [ 3, 32,  0,  0],
       [ 6, 31,  3, 17]])

In [6]:
def model_base(input_dim=None, input_length=None, output_dim=8):
  """Build and compile the embedding-based binary classifier.

  Architecture: Embedding -> Flatten -> Dense(1, sigmoid), compiled with
  binary cross-entropy, the Adam optimizer and the accuracy metric.

  Args:
    input_dim: Size of the index space accepted by the Embedding layer.
      When None, the notebook-level `vocab_size` is read at call time.
    input_length: Number of indices per input sequence. When None, the
      notebook-level `max_length` is read at call time.
    output_dim: Width of each embedding vector (8 by default).

  Returns:
    The compiled Sequential model.
  """
  # Fall back to the notebook globals only when the caller does not pin the
  # sizes, so a plain `model_base()` behaves exactly as before while an
  # explicit call no longer depends on whatever the globals currently hold.
  if input_dim is None:
    input_dim = vocab_size
  if input_length is None:
    input_length = max_length

  model = Sequential()
  model.add(Embedding(input_dim=input_dim, output_dim=output_dim, input_length=input_length, embeddings_initializer='uniform'))
  # Flatten the (input_length, output_dim) embeddings into a single vector.
  model.add(Flatten())
  model.add(Dense(1, activation='sigmoid', kernel_initializer='glorot_uniform'))
  model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
  return model

In [7]:
# Build the classifier and print its layer-by-layer summary.
model = model_base()
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 4, 8)              400       
_________________________________________________________________
flatten (Flatten)            (None, 32)                0         
_________________________________________________________________
dense (Dense)                (None, 1)                 33        
Total params: 433
Trainable params: 433
Non-trainable params: 0
_________________________________________________________________


In [8]:
# Train on the padded documents for 50 epochs; verbose=0 silences the log.
model.fit(x=paddec_doc, y=labels, epochs=50, verbose=0)

<keras.callbacks.History at 0x7fbb6a016e90>

In [15]:
# Score the model on the same documents and labels it was trained on, then
# report the accuracy as a percentage.
loss, accuracy = model.evaluate(x=paddec_doc, y=labels, verbose=0)
print('Accuracy %f' % (100 * accuracy))

Accuracy 89.999998


In [20]:
# Prediction: encode a new phrase with the same one_hot settings used for the
# training documents.
docs_valide = ['Well good']
input_valide_doc = [
    one_hot(doc, vocab_size, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower=True, split=' ')
    for doc in docs_valide
]
# Pad to `max_length` rather than a literal 4: the model's input length is
# built from the same variable, so the two cannot drift apart.
sec_valide_doc = pad_sequences(input_valide_doc, maxlen=max_length, padding='post')
sec_valide_doc

array([[ 4, 19,  0,  0]], dtype=int32)

In [21]:
# Output of the model's final sigmoid unit for the new phrase.
model.predict(x=sec_valide_doc)

array([[0.5573792]], dtype=float32)

Tokenizer

Keras proporciona la clase `Tokenizer`, que permite vectorizar un corpus de texto. Esta clase se ajusta a los datos de entrenamiento con `fit_on_texts()`, convierte texto en secuencias de enteros de manera consistente mediante el método `texts_to_sequences()` y brinda acceso al mapeo de palabras a enteros a través del atributo `word_index`.

In [35]:
# Fit a word-level Tokenizer on the corpus and show its word -> index mapping.
tokenizar = Tokenizer(
    num_words=None,
    split=' ',
    char_level=False,
    oov_token=None,
)
tokenizar.fit_on_texts(docs)
tokenizar.word_index

{'better': 14,
 'could': 12,
 'done': 2,
 'effort': 4,
 'excellent': 9,
 'good': 3,
 'great': 7,
 'have': 13,
 'nice': 8,
 'not': 11,
 'poor': 5,
 'weak': 10,
 'well': 6,
 'work': 1}

In [38]:
# The +1 leaves room for index 0: the tokenizer's word indices start at 1.
# NOTE(review): 0 is presumably kept free for the padding value — confirm.
vocab_size = 1 + len(tokenizar.word_index)
vocab_size

15

In [40]:
# Replace every word of each document with its index from the fitted tokenizer.
sequence_doc = tokenizar.texts_to_sequences(texts=docs)
sequence_doc

[[6, 2],
 [3, 1],
 [7, 4],
 [8, 1],
 [9],
 [10],
 [5, 4],
 [11, 3],
 [5, 1],
 [12, 13, 2, 14]]

In [49]:
# Display the tokenizer's configuration as a dictionary.
tokenizar.get_config()

{'char_level': False,
 'document_count': 10,
 'filters': '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
 'index_docs': '{"6": 1, "2": 2, "3": 2, "1": 3, "7": 1, "4": 2, "8": 1, "9": 1, "10": 1, "5": 2, "11": 1, "12": 1, "13": 1, "14": 1}',
 'index_word': '{"1": "work", "2": "done", "3": "good", "4": "effort", "5": "poor", "6": "well", "7": "great", "8": "nice", "9": "excellent", "10": "weak", "11": "not", "12": "could", "13": "have", "14": "better"}',
 'lower': True,
 'num_words': None,
 'oov_token': None,
 'split': ' ',
 'word_counts': '{"well": 1, "done": 2, "good": 2, "work": 3, "great": 1, "effort": 2, "nice": 1, "excellent": 1, "weak": 1, "poor": 2, "not": 1, "could": 1, "have": 1, "better": 1}',
 'word_docs': '{"well": 1, "done": 2, "good": 2, "work": 3, "great": 1, "effort": 2, "nice": 1, "excellent": 1, "weak": 1, "poor": 2, "not": 1, "could": 1, "have": 1, "better": 1}',
 'word_index': '{"work": 1, "done": 2, "good": 3, "effort": 4, "poor": 5, "well": 6, "great": 7, "nice": 8, "excel

In [50]:
# Pad the tokenized documents to a common length, appending zeros at the end.
# The result gets its own name: assigning it to `pad_sequences` would replace
# the imported function with an array, so re-running this cell (or any other
# cell that pads) would fail with "'numpy.ndarray' object is not callable".
max_length = 4
padded_docs = pad_sequences(sequence_doc, maxlen=max_length, padding='post')
print(padded_docs)

[[ 6  2  0  0]
 [ 3  1  0  0]
 [ 7  4  0  0]
 [ 8  1  0  0]
 [ 9  0  0  0]
 [10  0  0  0]
 [ 5  4  0  0]
 [11  3  0  0]
 [ 5  1  0  0]
 [12 13  2 14]]
