In [101]:
# This notebook is an example of classifying short sentences as positive or negative,
# and it also demonstrates the usage of the Keras Embedding layer.

In [102]:
import numpy as np

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from keras.models import Sequential
from keras.layers.embeddings import Embedding
from keras.layers import Dense, Flatten


In [103]:
# Toy corpus: short review phrases grouped by sentiment.
positive_docs = [
    'Well done!',
    'Good work',
    'Very nice',
    'Great effort',
    'nice work',
    'Excellent!',
]
negative_docs = [
    'Weak',
    'Poor effort!',
    'not good',
    'poor work',
    'Could have done better.',
]
docs = positive_docs + negative_docs

# Class labels aligned with docs: 1 = positive, 0 = negative.
labels = np.array([1] * len(positive_docs) + [0] * len(negative_docs))

In [104]:
# Tokenizer with an explicit out-of-vocabulary token named 'UNK'.
tk = Tokenizer(oov_token="UNK")


In [105]:
# Build the vocabulary from the training documents.
tk.fit_on_texts(texts=docs)
# Show the index -> word mapping that was learned.
tk.index_word

{1: 'UNK',
 2: 'work',
 3: 'done',
 4: 'good',
 5: 'nice',
 6: 'effort',
 7: 'poor',
 8: 'well',
 9: 'very',
 10: 'great',
 11: 'excellent',
 12: 'weak',
 13: 'not',
 14: 'could',
 15: 'have',
 16: 'better'}

In [106]:
# Replace every word of every document with its integer index from the tokenizer.
encoded_docs = tk.texts_to_sequences(texts=docs)
encoded_docs

[[8, 3],
 [4, 2],
 [9, 5],
 [10, 6],
 [5, 2],
 [11],
 [12],
 [7, 6],
 [13, 4],
 [7, 2],
 [14, 15, 3, 16]]

In [107]:
# Pad every encoded document at the end (padding='post') to a fixed length of 4 tokens,
# so that all rows of the input have the same width.
max_length = 4
padded_docs = pad_sequences(
    sequences=encoded_docs,
    maxlen=max_length,
    padding='post',
)
print(padded_docs)

[[ 8  3  0  0]
 [ 4  2  0  0]
 [ 9  5  0  0]
 [10  6  0  0]
 [ 5  2  0  0]
 [11  0  0  0]
 [12  0  0  0]
 [ 7  6  0  0]
 [13  4  0  0]
 [ 7  2  0  0]
 [14 15  3 16]]


In [108]:
# define the model

# The tokenizer's word indices start at 1, and 0 is the value used to pad the
# sequences, so the embedding table needs one more row than the number of
# indexed words (row 0 is the padding row).
vocab_size = len(tk.index_word) + 1

# Embedding -> Flatten -> single sigmoid unit for binary (positive/negative) output.
model = Sequential()
embed = Embedding(vocab_size, output_dim=8, input_length=max_length)
model.add(embed)
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
# compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# summarize the model; summary() prints the table itself and returns None,
# so wrapping it in print() would emit a stray "None" line after the table
model.summary()

Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, 4, 8)              136       
_________________________________________________________________
flatten_5 (Flatten)          (None, 32)                0         
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 33        
Total params: 169
Trainable params: 169
Non-trainable params: 0
_________________________________________________________________
None


In [109]:
# Train on the full toy dataset for 50 epochs, logging each epoch.
model.fit(x=padded_docs, y=labels, epochs=50, verbose=1)

# Score the model on the same documents it was trained on (no held-out set is used
# here), so the reported figure is training accuracy.
loss, accuracy = model.evaluate(x=padded_docs, y=labels, verbose=0)
accuracy_pct = accuracy * 100
print('Accuracy: %f' % accuracy_pct)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Accuracy: 72.727275


In [110]:
# Two unseen phrases built from words the tokenizer already knows.
test_docs = [
    'very good',
    'very poor',
]

# Encode them with the tokenizer that was fitted on the training documents.
test_encoded_docs = tk.texts_to_sequences(texts=test_docs)
test_encoded_docs

[[9, 4], [9, 7]]

In [111]:
# Pad the test sequences exactly like the training sequences (same length, same side).
test_padded_docs = pad_sequences(
    sequences=test_encoded_docs,
    maxlen=max_length,
    padding='post',
)
print(test_padded_docs)

[[9 4 0 0]
 [9 7 0 0]]


In [112]:
# One sigmoid output per test document.
y_pred = model.predict(x=test_padded_docs)
y_pred


array([[0.55882627],
       [0.57063407]], dtype=float32)

In [113]:
# Display the trained weights of the embedding layer (a list of arrays).
embed.get_weights()

[array([[-2.8333687e-03, -3.3480845e-02,  5.5552438e-02, -5.4558508e-02,
         -4.2722899e-02, -2.3858266e-02, -6.0415097e-02, -6.7735694e-02],
        [ 2.5708426e-02,  1.7102096e-02, -3.8722903e-04,  2.5692847e-02,
          1.1156999e-02, -7.1636215e-03,  7.6245442e-03,  3.9116964e-03],
        [ 2.1070907e-02,  3.1960603e-02,  6.6141345e-02, -4.2721730e-02,
         -8.4666803e-02,  7.7068023e-02, -6.7876622e-02,  7.0303731e-02],
        [ 6.5865941e-02,  6.9620982e-02,  6.6943698e-02, -3.2968737e-02,
         -7.6907150e-02,  2.2538776e-02,  3.6535215e-02,  9.5670037e-02],
        [ 2.5622102e-02, -5.4880779e-02,  7.3884182e-02,  4.0044077e-05,
          6.1757363e-02, -2.7064726e-02,  1.7423810e-02, -5.9651185e-02],
        [ 1.7818743e-02, -6.3986517e-02,  6.8396576e-02, -2.4390290e-02,
         -9.1411725e-02,  4.1740272e-02, -5.9658948e-02, -9.7657725e-02],
        [-7.0328131e-02, -6.8728991e-02, -1.0117333e-01,  2.6476542e-02,
          9.6251450e-02, -8.5902726e-03,  8.4

In [114]:
# The embedding layer's first weight array is a matrix with vocab_size rows
# and output_dim columns.
embedding_matrix = embed.get_weights()[0]
print(embedding_matrix.shape)

(17, 8)
