In [131]:
import io

import numpy as np
from numpy import array
from numpy import asarray
from numpy import zeros
from nltk.tokenize import RegexpTokenizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding

In [132]:
# Prepare Glove File
def readGloveFile(gloveFile):
    """Load a GloVe text file and build the vocabulary lookup tables.

    Every non-blank line is expected to hold a token followed by its
    whitespace-separated vector components. Blank lines are skipped.

    Args:
        gloveFile: path to the GloVe text file; it is decoded as UTF-8.

    Returns:
        A tuple ``(wordToIndex, indexToWord, wordToGlove)`` where
        ``wordToIndex`` maps each token to a 1-based index (tokens are
        numbered in sorted order), ``indexToWord`` is the inverse mapping,
        and ``wordToGlove`` maps each token to its ``np.float64`` vector.
    """
    wordToGlove = {}  # map from a token (word) to a Glove embedding vector

    # io.open takes an explicit encoding, so decoding does not depend on the
    # platform's default locale.
    with io.open(gloveFile, 'r', encoding='utf-8') as f:
        for line in f:
            record = line.strip().split()
            if not record:
                continue  # blank line: nothing to parse, and record[0] would raise IndexError
            token = record[0]  # first field is the token (word)
            wordToGlove[token] = np.array(record[1:], dtype=np.float64)  # remaining fields are the vector

    wordToIndex = {}  # map from a token to an index
    indexToWord = {}  # map from an index to a token
    # Indices start at 1 so that index 0 is never assigned to a real token.
    for kerasIdx, tok in enumerate(sorted(wordToGlove.keys()), start=1):
        wordToIndex[tok] = kerasIdx  # token -> index
        indexToWord[kerasIdx] = tok  # index -> token (inverse of the dictionary above)

    return wordToIndex, indexToWord, wordToGlove

In [133]:
# Create Pretrained Keras Embedding Layer
def createPretrainedEmbeddingLayer(wordToGlove, wordToIndex, inputLength, isTrainable):
    """Build a Keras Embedding layer initialised from GloVe vectors.

    Args:
        wordToGlove: mapping token -> embedding vector (numpy array).
        wordToIndex: mapping token -> row index in the embedding matrix.
        inputLength: passed to the layer as ``input_length``.
        isTrainable: passed to the layer as ``trainable``.

    Returns:
        The constructed ``Embedding`` layer. Its weight matrix has
        ``len(wordToIndex) + 1`` rows; rows not referenced by
        ``wordToIndex`` remain all zeros.

    Side effects:
        Prints the embedding dimension.
    """
    # One extra row beyond the vocabulary size (adding 1 to account for masking).
    numRows = len(wordToIndex) + 1

    # The vector width is read from an arbitrary entry, so any GloVe
    # dimension (e.g. 50) works.
    sampleVector = next(iter(wordToGlove.values()))
    embDim = sampleVector.shape[0]

    print(embDim)

    # Start from zeros and copy each token's vector into its row.
    weightMatrix = np.zeros((numRows, embDim))
    for token, row in wordToIndex.items():
        weightMatrix[row, :] = wordToGlove[token]

    return Embedding(
        numRows,
        embDim,
        weights=[weightMatrix],
        input_length=inputLength,
        trainable=isTrainable
    )


In [134]:
# Embedding
def getEncodedDocs(docs):
    """Convert raw text documents into lists of vocabulary indices.

    Each document is lower-cased and split into runs of word characters;
    every token is then looked up in the module-level ``wordToIndex``.
    Tokens that are not in the vocabulary are encoded as 0.

    Args:
        docs: iterable of strings.

    Returns:
        A list with one list of integer indices per input document.
    """
    tokenizer = RegexpTokenizer(r'\w+')

    encoded_docs = []
    for doc in docs:
        tokens = tokenizer.tokenize(doc.lower())
        # .get() rather than [] so an out-of-vocabulary word falls back to 0
        # instead of raising KeyError.
        encoded_docs.append([wordToIndex.get(word, 0) for word in tokens])

    return encoded_docs

In [135]:
# usage: load the GloVe vectors and wrap them in a non-trainable Embedding layer
max_length = 4
wordToIndex, indexToWord, wordToGlove = readGloveFile("glove/glove.6B.100d.txt")
pretrainedEmbeddingLayer = createPretrainedEmbeddingLayer(
    wordToGlove,
    wordToIndex,
    inputLength=max_length,
    isTrainable=False
)

100


In [136]:
# Sanity check: vocabulary size, and the index assigned to a known word.
print(len(wordToIndex))
print(wordToIndex["well"])

400000
385218


In [137]:
# define documents
docs = [
    # positive examples
    'Well done!',
    'Good work',
    'Great effort',
    'nice work',
    'Excellent!',
    # negative examples
    'Weak',
    'Poor effort!',
    'not good',
    'poor work',
    'Could have done better.',
]
# define class labels, aligned with docs: 1 = positive, 0 = negative
labels = array([1] * 5 + [0] * 5)

In [138]:
# Turn each document into a list of vocabulary indices (one index per token).
encoded_docs = getEncodedDocs(docs)    
print(encoded_docs)

[[385218, 127491], [164328, 389836], [166369, 133946], [260760, 389836], [142331], [384383], [288743, 133946], [264550, 164328], [288743, 389836], [110156, 174642, 127491, 74597]]


In [139]:
# pad documents to a max length of 4 words
# padding='post' appends the zeros after the real tokens, so every row has
# exactly max_length entries.
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
print(padded_docs)

[[385218 127491      0      0]
 [164328 389836      0      0]
 [166369 133946      0      0]
 [260760 389836      0      0]
 [142331      0      0      0]
 [384383      0      0      0]
 [288743 133946      0      0]
 [264550 164328      0      0]
 [288743 389836      0      0]
 [110156 174642 127491  74597]]


In [140]:
# define model: pretrained GloVe embedding -> flatten -> single sigmoid unit
model = Sequential([
    pretrainedEmbeddingLayer,
    Flatten(),
    Dense(1, activation='sigmoid'),
])

In [141]:
# compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
# summarize the model
# summary() prints the table itself and returns None, so it must not be
# wrapped in print() (that would emit a stray "None" line after the table).
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 4, 100)            40000100  
_________________________________________________________________
flatten_7 (Flatten)          (None, 400)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 401       
Total params: 40,000,501
Trainable params: 401
Non-trainable params: 40,000,100
_________________________________________________________________
None


In [142]:
# fit the model
# Trains on the padded index sequences for 100 epochs; verbose=1 logs every epoch.
model.fit(padded_docs, labels, epochs=100, verbose=1)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.History at 0x122f92710>

In [143]:
# evaluate the model
# NOTE: this scores the same padded_docs/labels that were passed to fit(),
# so the figure is training accuracy, not held-out performance.
loss, accuracy = model.evaluate(padded_docs, labels, verbose=0)
print('Accuracy: %f' % (accuracy*100))

Accuracy: 100.000000


In [144]:
# Show the architecture again after training (layers and parameter counts).
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 4, 100)            40000100  
_________________________________________________________________
flatten_7 (Flatten)          (None, 400)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 401       
Total params: 40,000,501
Trainable params: 401
Non-trainable params: 40,000,100
_________________________________________________________________


In [146]:
# test prediction

# Reuse the padded index matrix built from `docs` rather than a hand-copied
# literal: the indices depend on the loaded GloVe vocabulary and on
# max_length, so a hard-coded copy silently goes stale when either changes.
test = padded_docs

y_pred = model.predict(test)
print(y_pred)
print(docs)
print(labels)

[[0.76810914]
 [0.80124456]
 [0.8380265 ]
 [0.8395677 ]
 [0.7456014 ]
 [0.23684552]
 [0.19560532]
 [0.23353581]
 [0.27315626]
 [0.0324432 ]]
['Well done!', 'Good work', 'Great effort', 'nice work', 'Excellent!', 'Weak', 'Poor effort!', 'not good', 'poor work', 'Could have done better.']
[1 1 1 1 1 0 0 0 0 0]


In [147]:
# save model
# Writes the trained Keras model to 'model.h5' in the current working directory.
model.save('model.h5')

In [148]:
# export coreml
# coremltools is imported here rather than with the other imports, so it is
# only needed when this export cell is run.
import coremltools
# Convert the in-memory Keras model and write it next to model.h5.
coreml_model = coremltools.converters.keras.convert(model)
coreml_model.save('model.mlmodel')

0 : embedding_7_input, <keras.engine.topology.InputLayer object at 0x1218c82d0>
1 : embedding_7, <keras.layers.embeddings.Embedding object at 0x121c317d0>
2 : flatten_7, <keras.layers.core.Flatten object at 0x1218c8b50>
3 : dense_6, <keras.layers.core.Dense object at 0x1218c8b90>
4 : dense_6__activation__, <keras.layers.core.Activation object at 0x124115f90>


In [149]:
# dump coreml
# Bare expression: the notebook displays the converted model's repr, which
# lists its input and output names and types.
coreml_model

input {
  name: "input1"
  type {
    multiArrayType {
      shape: 1
      dataType: DOUBLE
    }
  }
}
output {
  name: "output1"
  type {
    multiArrayType {
      shape: 1
      dataType: DOUBLE
    }
  }
}

In [150]:
# Make predictions
# The four values are the padded indices of the first document ('Well done!'
# followed by two padding zeros), passed as floats under the 'input1' key.
# NOTE(review): each index is wrapped as [[x]]; presumably this is the
# (sequence, batch, feature) layout the converted model expects — confirm.
predictions = coreml_model.predict({'input1': [[[385218.0]],  [[127491.0]],  [[0.0]],  [[0.0]]] })

In [151]:
# Print the Core ML result dict (keyed by the model's output name) so it can
# be compared with the Keras prediction for the same document.
print(predictions)

{u'output1': array([0.76757812])}
