In [24]:
import io

import numpy as np
from numpy import array
from numpy import asarray
from numpy import zeros
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding

In [25]:
# Prepare Glove File
def readGloveFile(gloveFile):
    """Parse a GloVe text file into three lookup tables.

    Every non-blank line is expected to have the form
    ``token v1 v2 ... vN`` with whitespace-separated fields.

    Args:
        gloveFile: path to the GloVe ``.txt`` file; it is decoded as UTF-8.

    Returns:
        A tuple ``(wordToIndex, indexToWord, wordToGlove)`` where
        ``wordToIndex`` maps each token to a 1-based index (tokens are
        numbered in sorted order), ``indexToWord`` is the inverse mapping,
        and ``wordToGlove`` maps each token to its float64 numpy vector.
    """
    wordToGlove = {}  # token (word) -> GloVe embedding vector

    # io.open takes ``encoding`` on both Python 2 and Python 3, so the file is
    # decoded as UTF-8 regardless of the platform's default encoding.
    with io.open(gloveFile, 'r', encoding='utf-8') as f:
        for line in f:
            record = line.strip().split()
            if not record:
                # Blank line (e.g. a trailing newline): nothing to parse.
                continue
            # First field is the token, the remaining fields are the vector.
            wordToGlove[record[0]] = np.array(record[1:], dtype=np.float64)

    wordToIndex = {}  # token -> index
    indexToWord = {}  # index -> token (inverse of wordToIndex)
    # Indices start at 1 so that index 0 is never assigned to a token.
    for kerasIdx, tok in enumerate(sorted(wordToGlove), start=1):
        wordToIndex[tok] = kerasIdx
        indexToWord[kerasIdx] = tok

    return wordToIndex, indexToWord, wordToGlove

In [26]:
# Create Pretrained Keras Embedding Layer
def createPretrainedEmbeddingLayer(wordToGlove, wordToIndex, isTrainable):
    """Build a Keras ``Embedding`` layer initialised from GloVe vectors.

    Args:
        wordToGlove: mapping token -> 1-D numpy vector. The embedding width
            is read from the first vector in the mapping.
        wordToIndex: mapping token -> row index in the weight matrix. Row 0
            stays all-zero as long as no token is mapped to index 0.
        isTrainable: forwarded to the layer as ``trainable``.

    Returns:
        An ``Embedding`` layer with ``len(wordToIndex) + 1`` rows whose
        initial weights are the GloVe vectors.
    """
    numRows = len(wordToIndex) + 1  # one extra row for index 0
    sampleVector = next(iter(wordToGlove.values()))
    numCols = sampleVector.shape[0]  # width comes from the data (e.g. 50, 100)

    # Start from zeros, then copy each token's vector into its own row.
    weightMatrix = np.zeros((numRows, numCols))
    for token, row in wordToIndex.items():
        weightMatrix[row, :] = wordToGlove[token]

    return Embedding(numRows, numCols, weights=[weightMatrix], trainable=isTrainable)


In [27]:
# Usage: read the 100-dimensional GloVe file and wrap its vectors in an
# Embedding layer. The last argument (isTrainable=False) is forwarded as
# `trainable`, so the pretrained weights are kept fixed.
wordToIndex, indexToWord, wordToGlove = readGloveFile("glove/glove.6B.100d.txt")
pretrainedEmbeddingLayer = createPretrainedEmbeddingLayer(wordToGlove, wordToIndex, False)
# Example of plugging the layer into a model (left disabled):
#model = Sequential()
#model.add(pretrainedEmbeddingLayer)

In [35]:
# Sanity check: number of tokens read from the GloVe file, and the index
# assigned to the token "well".
print(len(wordToIndex))
print(wordToIndex["well"])

400000
385218


In [2]:
# define documents: ten short snippets, one per line
docs = [
    "Well done!",
    "Good work",
    "Great effort",
    "nice work",
    "Excellent!",
    "Weak",
    "Poor effort!",
    "not good",
    "poor work",
    "Could have done better.",
]
# define class labels: 1 for the first five documents, 0 for the last five
labels = array([1] * 5 + [0] * 5)

In [3]:
# prepare tokenizer: fit it on the documents so that `t.word_index` holds the
# word -> integer index mapping used by the cells below
t = Tokenizer()
t.fit_on_texts(docs)
# One more than the number of words in word_index: index 0 is not a word but
# is used as the padding value in the padded sequences.
vocab_size = len(t.word_index) + 1

In [4]:
# integer encode the documents: each document becomes the list of the
# tokenizer indices of its words
encoded_docs = t.texts_to_sequences(docs)
print(encoded_docs)

[[6, 2], [3, 1], [7, 4], [8, 1], [9], [10], [5, 4], [11, 3], [5, 1], [12, 13, 2, 14]]


In [5]:
# pad documents to a max length of 4 words
# (the longest encoded document above has exactly 4 indices)
max_length = 4
# padding='post': as the printed result shows, shorter sequences are filled
# with 0 at the end rather than at the start
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
print(padded_docs)

[[ 6  2  0  0]
 [ 3  1  0  0]
 [ 7  4  0  0]
 [ 8  1  0  0]
 [ 9  0  0  0]
 [10  0  0  0]
 [ 5  4  0  0]
 [11  3  0  0]
 [ 5  1  0  0]
 [12 13  2 14]]


In [6]:
# load the whole embedding into memory: word -> float32 coefficient vector
embeddings_index = dict()
# The with-block closes the file even if a line fails to parse. io.open takes
# `encoding` on both Python 2 and 3, so the file is decoded as UTF-8
# regardless of the platform's default encoding.
with io.open('glove/glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # blank line (e.g. trailing newline): nothing to parse
            continue
        word = values[0]  # first field is the word itself
        coefs = asarray(values[1:], dtype='float32')  # remaining fields are the vector
        embeddings_index[word] = coefs
print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 400000 word vectors.


In [7]:
# Build the weight matrix for the words seen in the training docs: row i
# receives the GloVe vector of the word whose tokenizer index is i. Rows for
# words that have no GloVe vector (and row 0) keep their initial zeros.
embedding_matrix = zeros((vocab_size, 100))
for word, i in t.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        continue  # word not present in the GloVe vocabulary
    embedding_matrix[i] = embedding_vector

In [8]:
# Inspect the result: the vocabulary size, the matrix shape (one row per
# vocabulary index, one column per embedding dimension) and its contents.
print(vocab_size)
print(embedding_matrix.shape)
print(embedding_matrix)

15
(15, 100)
[[ 0.          0.          0.         ...  0.          0.
   0.        ]
 [-0.11619     0.45447001 -0.69216001 ... -0.54737002  0.48822001
   0.32246   ]
 [-0.2978      0.31147    -0.14937    ... -0.22709    -0.029261
   0.4585    ]
 ...
 [ 0.05869     0.40272999  0.38633999 ... -0.35973999  0.43718001
   0.10121   ]
 [ 0.15711001  0.65605998  0.0021149  ... -0.60614997  0.71004999
   0.41468999]
 [-0.047543    0.51914001  0.34283999 ... -0.26859     0.48664999
   0.55609   ]]


In [9]:
# define model: fixed (non-trainable) GloVe embedding -> flatten -> one sigmoid unit
model = Sequential()
# Take the embedding width and the sequence length from the objects that
# define them instead of repeating the literals 100 and 4, so the layer cannot
# drift out of sync with embedding_matrix or with the padding length.
e = Embedding(vocab_size, embedding_matrix.shape[1], weights=[embedding_matrix],
              input_length=max_length, trainable=False)
model.add(e)
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))

In [10]:
# compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
# summarize the model
# summary() prints the table itself and returns None; wrapping it in print()
# appended a stray "None" line to the output.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 4, 100)            1500      
_________________________________________________________________
flatten_1 (Flatten)          (None, 400)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 401       
Total params: 1,901
Trainable params: 401
Non-trainable params: 1,500
_________________________________________________________________
None


In [11]:
# fit the model: 50 epochs over the ten padded documents and their labels.
# As the last expression of the cell, the returned History object is echoed.
model.fit(padded_docs, labels, epochs=50, verbose=1)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x107947ed0>

In [12]:
# evaluate the model
# NOTE: this scores the same padded_docs/labels that were passed to
# model.fit, so the number reflects fit to the training data only.
loss, accuracy = model.evaluate(padded_docs, labels, verbose=0)
# accuracy is scaled by 100 so it is printed as a percentage
print('Accuracy: %f' % (accuracy*100))

Accuracy: 100.000000


In [13]:
# Show the layer / parameter-count table of the model again after training.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 4, 100)            1500      
_________________________________________________________________
flatten_1 (Flatten)          (None, 400)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 401       
Total params: 1,901
Trainable params: 401
Non-trainable params: 1,500
_________________________________________________________________


In [14]:
# test prediction
# Predict on the same padded documents the model was trained on. The matrix is
# derived from padded_docs rather than pasted in as a literal, so it cannot go
# stale if the documents or the tokenizer indices change.
test = array(padded_docs)

y_pred = model.predict(test)
# Print predictions next to the documents and their true labels for comparison.
print(y_pred)
print(docs)
print(labels)

[[0.6227683 ]
 [0.70504725]
 [0.6970315 ]
 [0.72804064]
 [0.65190464]
 [0.39854982]
 [0.3523373 ]
 [0.3189102 ]
 [0.4870337 ]
 [0.01890703]]
['Well done!', 'Good work', 'Great effort', 'nice work', 'Excellent!', 'Weak', 'Poor effort!', 'not good', 'poor work', 'Could have done better.']
[1 1 1 1 1 0 0 0 0 0]


In [15]:
# save model: write the trained Keras model to model.h5 in the working directory
model.save('model.h5')

In [16]:
# export coreml: convert the trained Keras model with coremltools' Keras
# converter and write the result to model.mlmodel
# NOTE(review): this import lives here rather than in the imports cell at the
# top of the notebook.
import coremltools
coreml_model = coremltools.converters.keras.convert(model)
coreml_model.save('model.mlmodel')



0 : embedding_1_input, <keras.engine.topology.InputLayer object at 0x11bec3a50>
1 : embedding_1, <keras.layers.embeddings.Embedding object at 0x11bec3810>
2 : flatten_1, <keras.layers.core.Flatten object at 0x107973210>
3 : dense_1, <keras.layers.core.Dense object at 0x1065bb050>
4 : dense_1__activation__, <keras.layers.core.Activation object at 0x122361390>


In [17]:
# dump coreml: echo the converted model as the cell output, which shows its
# input and output descriptions (names "input1" / "output1")
coreml_model

input {
  name: "input1"
  type {
    multiArrayType {
      shape: 1
      dataType: DOUBLE
    }
  }
}
output {
  name: "output1"
  type {
    multiArrayType {
      shape: 1
      dataType: DOUBLE
    }
  }
}

In [18]:
# Make predictions with the converted model.
# 'input1' is the input name shown in the model dump. The values 6, 2, 0, 0
# are the padded encoding of the first document ('Well done!'), given as floats.
# NOTE(review): each value is wrapped in two extra list levels; presumably this
# is the per-timestep layout the converted model expects -- confirm against the
# coremltools documentation.
predictions = coreml_model.predict({'input1': [[[6.0]],  [[2.0]],  [[0.0]],  [[0.0]]] })

In [19]:
# Show the Core ML result (a dict keyed by the output name 'output1'). It can
# be compared with the first Keras prediction printed by the test cell.
print(predictions)

{u'output1': array([0.62255859])}
