# Imports

In [None]:
import numpy as np
from nltk.tokenize import RegexpTokenizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, Model
from keras.layers import Dense, Input, Bidirectional, LSTM, GRU, TimeDistributed, Activation, Flatten, Embedding
from keras.optimizers import Adam



# Functions

In [None]:
# Prepare Glove File
def readGloveFile(gloveFile):
    """Load a GloVe text file and build the vocabulary lookup tables.

    Every non-blank line is expected to hold a token followed by the
    whitespace-separated components of its embedding vector. Blank lines
    are skipped.

    Args:
        gloveFile: Path to the GloVe text file (assumed to be UTF-8).

    Returns:
        A ``(wordToIndex, indexToWord, wordToGlove)`` tuple:
        ``wordToIndex`` maps token -> index, ``indexToWord`` is its inverse,
        and ``wordToGlove`` maps token -> float64 numpy vector. Indices are
        assigned in sorted token order and start at 1.
    """
    import io  # local import: io.open accepts `encoding` on Python 2 and 3 alike

    wordToGlove = {}  # map from a token (word) to its GloVe embedding vector

    # Decode explicitly instead of relying on the platform default encoding,
    # so non-ASCII tokens load the same way on every OS.
    with io.open(gloveFile, 'r', encoding='utf-8') as f:
        for line in f:
            record = line.strip().split()
            if not record:
                continue  # blank line: nothing to parse
            # First field is the token, the remaining fields are the vector.
            wordToGlove[record[0]] = np.array(record[1:], dtype=np.float64)

    wordToIndex = {}  # map from a token to an index
    indexToWord = {}  # map from an index to a token (inverse of wordToIndex)

    # Indices start at 1: index 0 is left free for padding/masking.
    for kerasIdx, tok in enumerate(sorted(wordToGlove), start=1):
        wordToIndex[tok] = kerasIdx
        indexToWord[kerasIdx] = tok

    return wordToIndex, indexToWord, wordToGlove

In [None]:
# Create Pretrained Keras Embedding Layer
def createPretrainedEmbeddingLayer(wordToGlove, wordToIndex, inputLength, isTrainable):
    """Build a Keras ``Embedding`` layer initialised with GloVe vectors.

    Args:
        wordToGlove: Mapping token -> embedding vector (numpy array).
        wordToIndex: Mapping token -> row index in the embedding matrix.
        inputLength: Passed to the layer as ``input_length``.
        isTrainable: Passed to the layer as ``trainable``.

    Returns:
        A ``(embeddingLayer, embDim)`` tuple, where ``embDim`` is the length
        of one embedding vector.
    """
    # One row per vocabulary entry, plus row 0 which stays all-zero (masking).
    vocabLen = 1 + len(wordToIndex)

    # Any vector will do for reading the dimensionality (e.g. 50, 100, ...).
    sampleVector = next(iter(wordToGlove.values()))
    embDim = sampleVector.shape[0]

    # Row i of the matrix holds the GloVe vector of the token with index i.
    embeddingMatrix = np.zeros((vocabLen, embDim))
    for token in wordToIndex:
        embeddingMatrix[wordToIndex[token], :] = wordToGlove[token]

    embeddingLayer = Embedding(
        vocabLen,
        embDim,
        weights=[embeddingMatrix],
        input_length=inputLength,
        trainable=isTrainable
    )
    return embeddingLayer, embDim

In [None]:
# Embedding
def getEncodedDocs(docs):
    """Convert documents into lists of vocabulary indices.

    Each document is lower-cased and split into ``\\w+`` tokens; every token
    is looked up in the module-level ``wordToIndex`` mapping.

    Args:
        docs: Iterable of strings.

    Returns:
        A list with one list of integer indices per document. Tokens that
        are not in ``wordToIndex`` are encoded as 0, the index reserved for
        padding/masking.
    """
    tokenizer = RegexpTokenizer(r'\w+')
    return [
        # .get() rather than [] so an out-of-vocabulary word falls back to 0
        # instead of raising KeyError.
        [wordToIndex.get(word, 0) for word in tokenizer.tokenize(doc.lower())]
        for doc in docs
    ]

# Import GloVe Pretrained dataset and create Embedding Layer

In [None]:
# Number of token indices per utterance; used both as the embedding layer's
# input length and as the padding length for encoded utterances.
max_length = 20

# Vocabulary lookup tables and raw vectors from the 100-dimensional GloVe file.
wordToIndex, indexToWord, wordToGlove = readGloveFile("glove/glove.6B.100d.txt")

# Embedding layer seeded with the GloVe vectors; weights are frozen.
pretrainedEmbeddingLayer, embDim = createPretrainedEmbeddingLayer(
    wordToGlove,
    wordToIndex,
    inputLength=max_length,
    isTrainable=False
)

# Import intents file

In [None]:
import json

# Load the intents dataset. The code below reads intents['intents'] as a list
# of entries carrying an 'intent' name and a list of 'utterances'.
# The file is opened in binary mode so that json detects the encoding itself
# rather than decoding with the platform's default text encoding.
with open('PharmacyDataset.json', 'rb') as json_data:
    intents = json.load(json_data)

# Padding, Encoding and Preparing final X and Y data for Training

In [None]:
# classes[i] is the name of intent i; encodedUtterances[i] holds that intent's
# utterances as a padded array of word indices (one row per utterance).
classes = []
encodedUtterances = []

for intent in intents['intents']:
    classes.append(intent['intent'])

    # Encode the words as vocabulary indices, then pad (at the end) or
    # truncate every utterance to max_length entries.
    encodedUtterances.append(
        pad_sequences(
            getEncodedDocs(intent['utterances']),
            maxlen=max_length,
            padding='post'
        )
    )



In [None]:
# Flatten the per-intent utterance arrays into one list of samples (train_x)
# with a matching list of one-hot class labels (train_y).
train_x, train_y = [], []
currentClass = 0

for intent in classes:
    # One-hot label: 1 at this intent's position, 0 everywhere else.
    y = [0] * len(classes)
    y[currentClass] = 1

    # Every utterance of this intent shares the same label.
    utterances = encodedUtterances[currentClass]
    train_x.extend(utterances)
    train_y.extend([y] * len(utterances))

    currentClass += 1

vectorSize = len(train_x[0])

# Array versions used for fitting and evaluating the model.
train_X = np.array(train_x)
train_Y = np.array(train_y)

# Sanity-check the resulting sizes.
print(classes, "classes")
print(vectorSize, "vector size")
print(len(train_x), len(train_x[0]), "x")
print(len(train_y), len(train_y[0]), "y")
print(train_X.shape)
print(train_Y.shape)

# Prepare and Compile Keras / TensorFlow model

In [None]:
# Earlier functional-API variant built around a bidirectional GRU, kept for reference:
#sequence_input = Input(shape=(None, len(train_x[0])), dtype='float')
#bidiGru = Bidirectional(GRU(100))(sequence_input)
#preds = Dense(len(train_y[0]), activation='softmax')(bidiGru)
#model = Model(sequence_input, preds)
#model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['acc'])


# NB: an LSTM is used because CoreML only supports LSTM inside a Bidirectional layer.


# Pipeline: pretrained embeddings -> bidirectional LSTM emitting one output per
# timestep -> two per-timestep dense layers with ReLU -> flatten -> softmax
# with one output per intent class.
model = Sequential([
                    pretrainedEmbeddingLayer,
                    #GRU(embDim, batch_size=1, input_shape=(None, embDim), return_sequences=True),
                    #Bidirectional(GRU(embDim, batch_size=1, input_shape=(None, embDim), return_sequences=True)),
                    Bidirectional(LSTM(embDim, batch_size=1, input_shape=(None, embDim), return_sequences=True)),
                    TimeDistributed(Dense(64)),
                    Activation('relu'),
                    TimeDistributed(Dense(32)),
                    Activation('relu'),
                    Flatten(),
                    Dense(len(train_y[0]), activation='softmax')
                   ])

# The label names the recurrent layer actually used above (LSTM, not GRU).
print("model fitting - Bidirectional LSTM")
model.summary()


In [None]:
# Compile the model: Adam optimizer, categorical cross-entropy loss (labels
# are one-hot vectors), and accuracy reported as the metric.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

# Train Model

In [None]:
# Train on the full set of encoded utterances for 50 epochs.
model.fit(x=train_X, y=train_Y, epochs=50)

# Save Model

In [None]:
# Persist the trained Keras model to an HDF5 file.
model.save(filepath='SwiftNLCGloveRNN.h5')

# Test Model

In [None]:
# Evaluate the model. Note that this scores the same data the model was
# trained on, so the numbers say nothing about generalisation.
loss, accuracy = model.evaluate(train_X, train_Y, verbose=0)
# The loss is a cross-entropy value, not a fraction, so it is printed as-is;
# only the accuracy is scaled to a percentage.
print('Loss: %f ' % loss)
print('Accuracy: %f ' % (accuracy*100))

In [None]:
print(intents)

# Encode and pad a single sample utterance the same way as the training data.
#test = np.array([[178126, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
test_docs = getEncodedDocs(["Display blood values for patient"])
padded_test_docs = pad_sequences(test_docs, maxlen=max_length, padding='post')

print(padded_test_docs)

# One row of class probabilities per input document.
y_pred = model.predict(padded_test_docs)
print(y_pred)

# Index of the most probable class (first one on ties) and its probability.
max_index = int(np.argmax(y_pred[0]))
max_value = y_pred[0][max_index]
print(max_value)
print(max_index)

# Class order matches the order of the entries in intents['intents'].
print(intents['intents'][max_index]['intent'])

# Export Word Embedding Array

In [None]:
import itertools
import json

# Preview the first ten vocabulary entries (fewer if the vocabulary is smaller).
# dict views cannot be indexed on Python 3, so take a slice of the iteration.
for i, (word, index) in enumerate(itertools.islice(wordToIndex.items(), 10)):
    print(word, index, i)

# Export the token -> index mapping as JSON.
with open('Words.json', 'w') as fp:
    json.dump(wordToIndex, fp)


# Export Model using CoreML Tools

In [None]:
import coremltools

# Convert the trained Keras model to a Core ML model whose input is named
# "vectors" and whose output is named "entities".
coreml_model = coremltools.converters.keras.convert(
    model,
    input_names="vectors",
    output_names="entities"
)
coreml_model

In [None]:
# Write the converted Core ML model to disk.
coreml_model.save('SwiftNLCGloveRNN.mlmodel')