# Import and load data file

In [None]:
import nltk
import spacy
import pickle
import json
import numpy as np
import random

In [None]:
# NLTK resources this notebook relies on:
#   - Punkt tokenizer data, used by nltk.word_tokenize in the preprocessing
#     cell (recent NLTK releases look it up as 'punkt_tab', older ones as
#     'punkt', so both are requested);
#   - WordNet, used by WordNetLemmatizer.
# Fetching all of them here lets a fresh kernel run top to bottom instead of
# failing with LookupError at tokenization time.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
from keras.layers import Activation, Dense, Dropout, Input
from keras.models import Sequential
from keras.optimizers import SGD

In [None]:
# Accumulators filled by the preprocessing cell:
#   words     - tokens collected from every pattern
#   classes   - intent tags
#   documents - (token list, tag) pairs, one per pattern
words = []
classes = []
documents = []

# Tokens dropped when the vocabulary is normalized.
ignore_words = ['?', '!']

# JSON text is UTF-8; name the encoding explicitly instead of relying on the
# platform's default text encoding.
with open('/content/intents.json', encoding='utf-8') as data_file:
    intents = json.load(data_file)

# Preprocessing Data

In [None]:
# Start from empty containers so that re-running this cell rebuilds the corpus
# instead of appending a second copy of every pattern and token.
words, classes, documents = [], [], []

for intent in intents['intents']:
  tag = intent['tag']
  for pattern in intent['patterns']:
    # Split the raw pattern text into tokens.
    tokens = nltk.word_tokenize(pattern)
    words.extend(tokens)

    # Keep the tokenized pattern paired with the intent it belongs to.
    documents.append((tokens, tag))

    # Register each tag once, in order of first appearance.
    if tag not in classes:
      classes.append(tag)

In [None]:
# Peek at a slice (indices 10-199) of the tokens collected so far.
print(words[10:200])

['Hola', 'Hello', 'Good', 'day', 'Bye', 'See', 'you', 'later', 'Goodbye', 'Nice', 'chatting', 'to', 'you', ',', 'bye', 'Till', 'next', 'time', 'Thanks', 'Thank', 'you', 'That', "'s", 'helpful', 'Awesome', ',', 'thanks', 'Thanks', 'for', 'helping', 'me', 'How', 'you', 'could', 'help', 'me', '?', 'What', 'you', 'can', 'do', '?', 'What', 'help', 'you', 'provide', '?', 'How', 'you', 'can', 'be', 'helpful', '?', 'What', 'support', 'is', 'offered', 'How', 'to', 'check', 'Adverse', 'drug', 'reaction', '?', 'Open', 'adverse', 'drugs', 'module', 'Give', 'me', 'a', 'list', 'of', 'drugs', 'causing', 'adverse', 'behavior', 'List', 'all', 'drugs', 'suitable', 'for', 'patient', 'with', 'adverse', 'reaction', 'Which', 'drugs', 'dont', 'have', 'adverse', 'reaction', '?', 'Open', 'blood', 'pressure', 'module', 'Task', 'related', 'to', 'blood', 'pressure', 'Blood', 'pressure', 'data', 'entry', 'I', 'want', 'to', 'log', 'blood', 'pressure', 'results', 'Blood', 'pressure', 'data', 'management', 'I', 'want

In [None]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

# Normalize the vocabulary: skip the tokens listed in ignore_words, then
# lowercase and lemmatize everything that remains.
words = [
    lemmatizer.lemmatize(token.lower())
    for token in words
    if token not in ignore_words
]

In [None]:
# Peek at a slice (indices 10-49) of the current contents of `words`.
print(words[10:50])

['hello', 'good', 'day', 'bye', 'see', 'you', 'later', 'goodbye', 'nice', 'chatting', 'to', 'you', ',', 'bye', 'till', 'next', 'time', 'thanks', 'thank', 'you', 'that', "'s", 'helpful', 'awesome', ',', 'thanks', 'thanks', 'for', 'helping', 'me', 'how', 'you', 'could', 'help', 'me', 'what', 'you', 'can', 'do', 'what']


In [None]:
# Collapse the vocabulary and the tag list into sorted, duplicate-free lists.
words = sorted(set(words))
classes = sorted(set(classes))

# documents = combination between patterns and intents
print (len(documents), "documents")
# classes = intents
print (len(classes), "classes", classes)
# words = all words, vocabulary
print (len(words), "unique lemmatized words", words)

# Persist both lists. The context managers guarantee each file is flushed and
# closed; the handles were previously opened inline and never closed.
with open('words.pkl', 'wb') as words_file:
    pickle.dump(words, words_file)
with open('classes.pkl', 'wb') as classes_file:
    pickle.dump(classes, classes_file)

47 documents
9 classes ['adverse_drug', 'blood_pressure', 'blood_pressure_search', 'goodbye', 'greeting', 'hospital_search', 'options', 'pharmacy_search', 'thanks']
88 unique lemmatized words ["'s", ',', 'a', 'adverse', 'all', 'anyone', 'are', 'awesome', 'be', 'behavior', 'blood', 'by', 'bye', 'can', 'causing', 'chatting', 'check', 'could', 'data', 'day', 'detail', 'do', 'dont', 'drug', 'entry', 'find', 'for', 'give', 'good', 'goodbye', 'have', 'hello', 'help', 'helpful', 'helping', 'hey', 'hi', 'history', 'hola', 'hospital', 'how', 'i', 'id', 'is', 'later', 'list', 'load', 'locate', 'log', 'looking', 'lookup', 'management', 'me', 'module', 'nearby', 'next', 'nice', 'of', 'offered', 'open', 'patient', 'pharmacy', 'pressure', 'provide', 'reaction', 'related', 'result', 'search', 'searching', 'see', 'show', 'suitable', 'support', 'task', 'thank', 'thanks', 'that', 'there', 'till', 'time', 'to', 'transfer', 'up', 'want', 'what', 'which', 'with', 'you']


In [None]:
# Show the first (token list, tag) pair as a sanity check.
documents[0]

(['Hi', 'there'], 'greeting')

# Create training data

In [None]:
# Rows of [bag_of_words, one_hot_label] are appended here by the next cell.
training = list()
# All-zero label template with one slot per class; copied for every pattern.
output_empty = [0 for _ in classes]

In [None]:
# training set, bag of words for each sentence
for doc in documents:
  bag = []
  pattern_words = doc[0]

   # lemmatize each word - create base word, in attempt to represent related words
  pattern_words = [lemmatizer.lemmatize(word.lower()) for word in pattern_words]

  # create our bag of words array with 1, if word match found in current pattern
  for w in words:
    bag.append(1) if w in pattern_words else bag.append(0)

  # output is a '0' for each tag and '1' for current tag (for each pattern)
  output_row = list(output_empty)
  output_row[classes.index(doc[1])] = 1

  training.append([bag, output_row])

In [None]:
# Shuffle with a dedicated, seeded generator so that a fresh top-to-bottom run
# always produces the same row order; the global `random` state is untouched.
random.Random(42).shuffle(training)

# Split every [bag, label] row into the feature list and the label list.
train_x = [bag_row for bag_row, _ in training]
train_y = [label_row for _, label_row in training]

In [None]:
# Report "<number of rows> , <row length>" for the features, then the labels.
print(f"{len(train_x)} , {len(train_x[0])}")
print(f"{len(train_y)} , {len(train_y[0])}")

47 , 88
47 , 9


# Modelling

Create the model with three dense layers: the first has 128 neurons, the second has 64 neurons, and the third (output) layer has one neuron per intent and uses softmax to predict the intent.

In [None]:
# Feed-forward intent classifier:
#   input (one value per vocabulary word) -> Dense 128 relu -> Dropout 0.5
#   -> Dense 64 relu -> Dropout 0.5 -> Dense softmax with one unit per class.
# The input shape is declared with an explicit Input layer; passing
# `input_shape=` to the first Dense layer makes recent Keras versions emit a
# UserWarning recommending exactly this form.
model = Sequential()
model.add(Input(shape=(len(train_x[0]),)))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(len(train_y[0]), activation='softmax'))

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Compile the model. Stochastic gradient descent with Nesterov accelerated gradient gives good results for this model.

In [None]:
# SGD optimizer with Nesterov momentum and a small weight decay.
sgd = SGD(
    learning_rate=0.01,
    momentum=0.9,
    nesterov=True,
    weight_decay=1e-6,
)
# Categorical cross-entropy pairs with the one-hot labels and softmax output.
model.compile(
    optimizer=sgd,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train on the whole dataset; the Python lists are converted to NumPy arrays
# for Keras. `hist` keeps the History object returned by fit().
hist = model.fit(np.array(train_x), np.array(train_y), epochs=200, batch_size=5, verbose=1)

# Model.save's second positional parameter is `overwrite`, not the training
# history, so `hist` must not be passed here. It previously worked only
# because the History object is truthy; the history was never stored in the file.
model.save('chatbot_model.h5')

Epoch 1/200
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.0752 - loss: 2.2918    
Epoch 2/200
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.1696 - loss: 2.1512     
Epoch 3/200
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.1469 - loss: 2.1132 
Epoch 4/200
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.3683 - loss: 2.0541 
Epoch 5/200
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.2733 - loss: 1.9826 
Epoch 6/200
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.4805 - loss: 1.8483 
Epoch 7/200
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.3997 - loss: 1.8965 
Epoch 8/200
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.5408 - loss: 1.7234 
Epoch 9/200
[1m10/10[0m [32m━━

