<a href="https://colab.research.google.com/github/Kaiziferr/NLP_Workshop/blob/master/embedding/02_workshop_embedding_glove.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

El presente proyecto tiene como finalidad el desarrollo de la incrustración de palabras con keras usando una incrustración previa, a travez de un ejercicio de análisis de sentimiento basico. El tutorial fue tomado de https://machinelearningmastery.com/use-word-embedding-layers-deep-learning-keras/

In [1]:
import numpy as np

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten, Embedding

In [2]:
docs = ['Well done!','Good work','Great effort','nice work','Excellent!','killer','Poor effort!',
		'not good', 'poor work', 'Could have done better.']

labels = np.array([1,1,1,1,1,0,0,0,0,0])

# **Tokenizer**
---



In [3]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(docs)

vocab_size = len(tokenizer.word_index) + 1
vocab_size

15

In [4]:
tokenizer.word_index

{'better': 14,
 'could': 12,
 'done': 2,
 'effort': 4,
 'excellent': 9,
 'good': 3,
 'great': 7,
 'have': 13,
 'killer': 10,
 'nice': 8,
 'not': 11,
 'poor': 5,
 'well': 6,
 'work': 1}

# **Secuencia**
---



In [5]:
encoded = tokenizer.texts_to_sequences(docs)
encoded

[[6, 2],
 [3, 1],
 [7, 4],
 [8, 1],
 [9],
 [10],
 [5, 4],
 [11, 3],
 [5, 1],
 [12, 13, 2, 14]]

In [6]:
max_length = 4
padded_sec = pad_sequences(encoded, maxlen=max_length, padding='post')
print(padded_sec)

[[ 6  2  0  0]
 [ 3  1  0  0]
 [ 7  4  0  0]
 [ 8  1  0  0]
 [ 9  0  0  0]
 [10  0  0  0]
 [ 5  4  0  0]
 [11  3  0  0]
 [ 5  1  0  0]
 [12 13  2 14]]


# **Glove**
---


cargar todo el archivo de incrustación de palabras de GloVe en la memoria

In [7]:
embedding_glove = dict()
with open('./data/glove.txt', 'r', encoding='cp1252') as file:
  for line in file:
    value = line.split()
    word = value[0]
    coef = np.array(value[1:], dtype="float32")
    embedding_glove[word] = coef
print('loaded %s word vectors' % len(embedding_glove))

loaded 3669 word vectors


In [8]:
embedding_matrix = np.zeros((vocab_size, 100))
for i, word in tokenizer.index_word.items():
  embedding_vector = embedding_glove.get(word)
  print(embedding_vector)
  if embedding_vector is not None:
    embedding_matrix[i] = embedding_vector

None
None
None
None
None
None
None
None
None
[ 0.62503   0.45228   0.53691  -0.52622   0.1173    0.25047   0.47217
 -0.64581  -0.14799  -0.16269  -0.34152   0.66793  -0.10531   0.34373
  0.80224   0.89863   0.066893  0.24999  -0.39885   0.85964   0.38381
 -0.66396  -0.072117  0.031933  0.54283   1.2385    0.044234 -0.21744
 -0.69443   0.11536   0.58598   0.52942   0.35977   0.69819   0.36275
 -0.89167  -0.50746  -1.0164    0.93345  -0.19236  -0.46195   0.70799
  0.25584  -0.24226   0.65123   0.60369  -0.051505 -0.50738   0.06415
  0.35843  -0.24853  -0.28729   0.5789    0.96343  -0.13954  -1.0618
 -0.37663   0.22548   0.74441   0.37475  -0.01772   1.3129   -0.38783
 -0.73654   1.1258    0.0706   -0.092238  0.42428   0.14767   0.49282
 -0.33878  -0.88214  -0.2196    0.076745 -0.12668   0.29099   0.53392
 -0.27073   0.53188  -0.004534  1.098     0.39016  -0.55458   0.038508
 -0.9452   -0.54092  -0.050421  0.28469  -0.6706    0.046016  0.2562
 -0.08889   0.49196   0.13846   0.42052  -0.21

# **Model**
---



In [9]:
model = Sequential()
e = Embedding(vocab_size, 100, weights = embedding_matrix, input_length = max_length ,trainable = False)
model.add(Flatten())
model.add(Dense(1, activation = 'sigmoid'))

In [10]:
model.compile(loss = 'binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [11]:
model.fit(padded_sec, labels, epochs = 50, verbose = 0)

<keras.callbacks.History at 0x7f3891fa5e90>

In [12]:
loss, accuracy = model.evaluate(padded_sec, labels, verbose=0)
print('Accuracy: %f' % (accuracy*100))

Accuracy: 50.000000


In [13]:
print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 flatten (Flatten)           (None, 4)                 0         
                                                                 
 dense (Dense)               (None, 1)                 5         
                                                                 
Total params: 5
Trainable params: 5
Non-trainable params: 0
_________________________________________________________________
None
