In [1]:
import nltk
import os
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem import WordNetLemmatizer
import numpy
import tensorflow as tf
import tflearn
import random
import json
import pickle
# https://techwithtim.net/tutorials/ai-chatbot/part-4/

curses is not supported on this machine (please install/reinstall curses for an optimal experience)


In [2]:
# Download the NLTK data packages this notebook relies on. Per the log below,
# a package that is already present is reported as up-to-date.
# 'punkt' is presumably what nltk.word_tokenize needs; 'wordnet' presumably
# backs WordNetLemmatizer -- TODO confirm against the NLTK docs.
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\123\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\123\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

What we are doing with the JSON file is creating a set of messages that the user is likely to type in and mapping them to a group of appropriate responses. The tag on each dictionary in the file indicates the group that each message belongs to. With this data we will train a neural network to take a sentence of words and classify it as one of the tags in our file. Then we can simply take a response from that group and display it to the user. The more tags, responses, and patterns you provide to the chatbot, the better and more complex it will be.

For each pattern we will turn it into a list of words using nltk.word_tokenize, rather than keeping it as a string. We will then add each tokenized pattern to our patterns list and its associated tag to the tags list.

In [3]:
stemmer = LancasterStemmer()
lemmatizer = WordNetLemmatizer()

# intents.json holds the intents: each has a tag, example patterns and responses.
with open("intents.json") as file:
    data = json.load(file)

# Tokenize every pattern up front. `patterns` and `tags` are read by the
# bag-of-words cell that follows regardless of whether the cache below is hit;
# when they were built only in the cache-miss branch, a fresh kernel with an
# existing data.pickle failed there with a NameError.
patterns = []  # list of tokens for each pattern
tags = []      # tag of each pattern, parallel to `patterns`
for intent in data["intents"]:
    for pattern in intent["patterns"]:
        patterns.append(nltk.word_tokenize(pattern))
        tags.append(intent["tag"])

try:
    # NOTE(review): pickle.load can execute arbitrary code -- only load a
    # data.pickle that this notebook wrote itself.
    # NOTE(review): a cache written for an older intents.json can disagree with
    # the freshly tokenized patterns/tags; delete data.pickle after editing intents.
    with open("data.pickle", "rb") as f:
        words, labels, training, output = pickle.load(f)
except (OSError, EOFError, pickle.UnpicklingError, ValueError, TypeError):
    # Cache missing or unreadable: rebuild the vocabulary and label list.
    # Only these expected failures are caught, so KeyboardInterrupt and genuine
    # programming errors are no longer silently swallowed.
    ignore = ['?', '!', '.', '$']  # punctuation tokens that carry no meaning for classification

    # Stemming is used instead of lemmatizing: a lemma is guaranteed to be a real
    # word, but the author observed lower accuracy with it.
    # A set removes repeated stems; sorting gives a stable vocabulary order.
    words = sorted({
        stemmer.stem(w.lower())
        for doc in patterns
        for w in doc
        if w not in ignore
    })

    # One label per distinct tag, sorted alphabetically.
    labels = sorted({intent["tag"] for intent in data["intents"]})

# Disabled debug prints, kept as a bare string (the cell echoes its repr).
"""
print(words)
print(labels)
print(patterns)
print(tags)
"""

'\nprint(words)\nprint(labels)\nprint(patterns)\nprint(tags)\n'

Now that we have loaded in our data and created a stemmed vocabulary, it's time to talk about a bag of words. As we know, neural networks and machine learning algorithms require numerical input, so our list of strings won't cut it. We need some way to represent our sentences with numbers, and this is where a bag of words comes in. What we are going to do is represent each sentence with a list whose length is the number of words in our model's vocabulary. Each position in the list will represent a word from our vocabulary. If the position in the list is a 1, the word exists in our sentence; if it is a 0, the word is not present. We call this a bag of words because the order in which the words appear in the sentence is lost; we only know the presence of words in our model's vocabulary.

As well as formatting our input we need to format our output to make sense to the neural network. Similarly to a bag of words we will create output lists which are the length of the amount of labels/tags we have in our dataset. Each position in the list will represent one distinct label/tag, a 1 in any of those positions will show which label/tag is represented.

In [4]:
    # Turn every tokenized pattern into a (bag-of-words, one-hot tag) pair.
    training = []  # one 0/1 vocabulary vector per pattern
    output = []    # one one-hot label vector per pattern

    # Template row: a 0 for every label.
    out_empty = [0] * len(labels)

    for idx, tokens in enumerate(patterns):
        # Stem the pattern's tokens the same way the vocabulary was stemmed.
        stems = [stemmer.stem(tok.lower()) for tok in tokens]

        # 1 where the vocabulary word occurs in this pattern, otherwise 0.
        bag = [1 if vocab_word in stems else 0 for vocab_word in words]

        # Copy the template and flag the position of this pattern's tag.
        one_hot = list(out_empty)
        one_hot[labels.index(tags[idx])] = 1

        training.append(bag)
        output.append(one_hot)

    # Convert both lists to numpy arrays for the model.
    training = numpy.array(training)
    output = numpy.array(output)
    #print(output)

In [5]:
# Write the vocabulary, labels and training arrays to data.pickle, the file the
# data-loading cell tries to read.
with open("data.pickle", "wb") as cache_file:
    pickle.dump((words, labels, training, output), cache_file)

# Reset TensorFlow's default graph before the network is defined.
tf.reset_default_graph()


In [6]:
# Checkpoint prefix shared by save, load and the existence check below.
MODEL_PATH = "model.tflearn"

# Network: bag-of-words input -> 128 units -> 64 units -> softmax over the tags.
# Input width is the vocabulary size; output width is the number of labels.
net = tflearn.input_data(shape=[None, len(training[0])])
net = tflearn.fully_connected(net, 128)
net = tflearn.fully_connected(net, 64)
net = tflearn.fully_connected(net, len(output[0]), activation="softmax")
net = tflearn.regression(net)
model = tflearn.DNN(net)

# Reuse a previously trained model when its checkpoint exists; otherwise train
# and save one. The saver writes "<prefix>.meta" next to the checkpoint, so the
# check must use the same prefix that model.save() is given. Testing for
# "model.meta" never matched, so the model was retrained on every run.
# NOTE(review): delete the model.tflearn.* files after changing intents.json,
# otherwise a checkpoint trained on the old vocabulary will be loaded.
if os.path.exists(MODEL_PATH + ".meta"):
    model.load(MODEL_PATH)
else:
    model.fit(training, output, n_epoch=1000, batch_size=8, show_metric=True)
    model.save(MODEL_PATH)

Training Step: 3999  | total loss: [1m[32m0.10635[0m[0m | time: 0.016s
| Adam | epoch: 1000 | loss: 0.10635 - acc: 0.9891 -- iter: 24/31
Training Step: 4000  | total loss: [1m[32m0.09712[0m[0m | time: 0.021s
| Adam | epoch: 1000 | loss: 0.09712 - acc: 0.9902 -- iter: 31/31
--
INFO:tensorflow:C:\Users\123\Documents\Data Analytics\projects\model.tflearn is not in all_model_checkpoint_paths. Manually adding it.


In [7]:
def bag_of_words(s, words):
    """Encode the sentence `s` as a 0/1 vector over the vocabulary `words`.

    The sentence is tokenized with nltk.word_tokenize, and each token is
    lower-cased and stemmed with the notebook-level `stemmer`.

    Args:
        s: the user's input sentence.
        words: the vocabulary; position i of the result corresponds to words[i].

    Returns:
        A numpy array with 1 at every position whose vocabulary word appears
        among the sentence's stems, and 0 elsewhere.
    """
    sentence_stems = {stemmer.stem(token.lower()) for token in nltk.word_tokenize(s)}
    return numpy.array([1 if vocab_word in sentence_stems else 0 for vocab_word in words])

In [8]:
def chatbot():
    """Run an interactive console chat loop until the user types "quit".

    Each input line is encoded with bag_of_words, classified by the trained
    `model`, and answered with a random response from the intent whose tag
    received the highest score. Once the "name" tag has been predicted, the
    reply prefix changes from "Bot: " to "Fiona: " for the rest of the session.
    """
    print("Start talking with my bot (type quit to stop)!")
    name = "Bot: "
    while True:
        user_text = input("You: ")
        if user_text.lower() == "quit":
            break

        # Pick the label with the highest predicted score.
        scores = model.predict([bag_of_words(user_text, words)])
        tag = labels[numpy.argmax(scores)]

        # No break: if several intents share the tag, the last one wins.
        for intent in data["intents"]:
            if intent['tag'] == tag:
                responses = intent['responses']

        # The bot introduces itself on the "name" intent and keeps that name.
        if tag == "name":
            name = "Fiona: "

        print(name, end="")
        print(random.choice(responses))

chatbot()

Start talking with my bot (type quit to stop)!
You: hi
Bot: Hello, thanks for visiting
You: thanks
Bot: Any time!
You: bye
Bot: Have a nice day
You: name
Fiona: My name is Fiona. Why? Because my creator thought it sounded nice.
You: hobbies
Fiona: My hobbies include playing video games, fitness (gym, biking etc.), basketball and watching sitcoms such as The Office, Brooklyn-99 and How I met your mother.
You: data science
Fiona: It started back in first year at a Loblaw networking session. That is when I was first introduced to data science. They described it as a puzzle and the more pieces you fit the clearer the picture got, and you would gain insight from different perspectives. Recently however, I have been wanting to better my software development skills as I want to be more well-rounded than only having 1 specialization/concentration.
You: peers
Fiona: He is a take charge, self managed leader. He is very hard working and will put the time in. Always setting strict deadlines to pus