In [None]:
# We'll first need the data from my drive.
# The data files read later in this notebook are addressed under
# '/content/drive/My Drive/...', so the drive has to be mounted at /content/drive first.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


# Outline

I'm going to attempt to build a simple chatbot from a simple set of intents and see how I can improve it afterwards.

My thought is to create a list of global variables, which get filled as the person says things. I can have the machine pick out Named entities and ask questions about information it doesn't have, as it gets information.

Latest Idea: write out a bunch of responses that the chatbot can have 
based on what the user writes. Presently, I'm thinking that the responses should be based on what information the user provides. For example, say the user provides in their sentence a place they want to go and their budget. The bot would then ask, "Okay, so you would like to go to {{this place}} and have a budget of {{this much}}. Where will you be travelling from and what date would you like to leave?" 

## Pseudocode

May be updated periodically

* import packages needed
* import data
* import intents
* tokenize data
* give a label to words using NER 
* train to recognize what information is contained in the sentence to give the correct response. 
  * This also entails recognizing whether the city stated is the origin or destination city
  * This should be able to handle budgets of all types, even no budget
  * We also need to be able to handle a variation of dates, and pick out the start and end dates
* Hard code some responses to the various permutations of data that is collected for each user question/statement.
  * Greeting
  * Greeting + Statement/Question relating to booking
  * Greeting + general question/statement
  * General question/statement
  * Statement/Question relating to booking
  * confirmations/rejections
  * conversation terminations
* The conversation should continue until the termination step


# Simple Chatbot

## Get data, and train the model

In [None]:
# We'll need the following nltk packages
import nltk
nltk.download('punkt')
nltk.download('wordnet')

# We'll use lemmas instead of stems.
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

# We'll need json to open a file of intents
import json

# We will need to save the model
import pickle

# Some helper libraries
import numpy as np
import random

# We'll need these to actually create a machine learning model.
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.optimizers import SGD

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# this list will be for a bunch of tokenized words
words=[]
# this list is to contain the list of tags contained in the intents. This means the high level characterization of any interactions
classes = []
# This will be a list of tuples which will classify words with their tags.
documents = []
# pieces of speech and text to ignore
ignore_words = ['?', '!', ',',"'s"]
# This is our initial intents file. Read it inside a `with` block so the file
# handle is closed as soon as the text has been read.
with open('/content/drive/My Drive/1000ml/Project 7 - Chatbot/Data/intents.json') as intents_file:
    data_file = intents_file.read()
intents = json.loads(data_file)

In [None]:
# Walk every intent and each of its example patterns, collecting the raw
# vocabulary, the (tokens, tag) pairs and the distinct tags.
for intent in intents['intents']:
    for pattern in intent['patterns']:
        tag = intent['tag']

        # split the pattern into word tokens and add them to the vocabulary
        w = nltk.word_tokenize(pattern)
        words += w
        # remember which tag this tokenized pattern belongs to
        documents += [(w, tag)]

        # register the tag the first time it is seen
        if tag in classes:
            continue
        classes.append(tag)

In [None]:
# Lemmatize all the words that aren't in our list of things to ignore
words = [lemmatizer.lemmatize(w.lower()) for w in words if w not in ignore_words]
# de-duplicate and sort the words alphabetically
words = sorted(set(words))
# de-duplicate and sort classes alphabetically
classes = sorted(set(classes))

# print out for QA
print (len(documents), "documents")

print (len(classes), "classes", classes)

print (len(words), "unique lemmatized words", words)

# pickle some things to save them for later; the `with` blocks make sure each
# file is flushed and closed before anything tries to read it back.
with open('words.pkl','wb') as words_file:
    pickle.dump(words, words_file)
with open('classes.pkl','wb') as classes_file:
    pickle.dump(classes, classes_file)

37 documents
7 classes ['budget', 'date', 'goodbye', 'greeting', 'options', 'thanks', 'travel_plans']
57 unique lemmatized words ['a', 'anyone', 'are', 'awesome', 'be', 'book', 'budget', 'bye', 'can', 'chatting', 'could', 'date', 'day', 'do', 'for', 'good', 'goodbye', 'have', 'hello', 'help', 'helpful', 'helping', 'hey', 'hi', 'hola', 'how', 'i', 'is', 'later', 'like', 'me', 'money', 'next', 'nice', 'of', 'offered', 'on', 'only', 'plane', 'provide', 'see', 'somewhere', 'spend', 'support', 'thank', 'thanks', 'that', 'there', 'this', 'till', 'time', 'to', 'travel', 'trip', 'want', 'what', 'you']


In [None]:
# In this cell we are creating our training data: one bag-of-words vector per
# document (pattern) and the corresponding one-hot class vector.
# initializing training data
training = []
output_empty = [0] * len(classes)
for doc in documents:
    # lemmatize each token of the pattern - create base word, in attempt to represent related words.
    # A set is used so the vocabulary scan below does a constant-time lookup per word.
    pattern_words = {lemmatizer.lemmatize(word.lower()) for word in doc[0]}
    # bag of words: 1 where the vocabulary word occurs in the current pattern, 0 otherwise
    bag = [1 if w in pattern_words else 0 for w in words]

    # output is a '0' for each tag and '1' for current tag (for each pattern)
    output_row = list(output_empty)
    output_row[classes.index(doc[1])] = 1

    training.append([bag, output_row])

In [None]:
# shuffle our features. This appears as though it would make learning a little less biased.
random.shuffle(training)
# create train and test lists. X - patterns, Y - intents
# Each row of `training` is [bag, output_row] and the two parts have different
# lengths (len(words) vs len(classes)). Wrapping that in np.array() builds a
# ragged array, which recent NumPy releases reject with a ValueError, so the
# two columns are pulled straight out of the shuffled list instead.
train_x = [row[0] for row in training]
train_y = [row[1] for row in training]
print("Training data created")

Training data created


In [None]:
# Feed-forward classifier: 128 relu units -> dropout -> 64 relu units -> dropout -> softmax output
# with one unit per intent. The input width is the bag-of-words length.
n_features = len(train_x[0])
n_intents = len(train_y[0])
model = Sequential([
    Dense(128, input_shape=(n_features,), activation='relu'),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(n_intents, activation='softmax'),
])

In [None]:
# Compile model. Stochastic gradient descent with Nesterov accelerated gradient gives good results for this model
# NOTE(review): `lr` and `decay` are the older Keras argument names; confirm they
# are still accepted by the Keras version this notebook runs on.
sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
model.compile(loss='categorical_crossentropy', optimizer=sgd, metrics=['accuracy'])

#fitting and saving the model
hist = model.fit(np.array(train_x), np.array(train_y), epochs=200, batch_size=5, verbose=1)
# Only the path is passed: the second positional parameter of `save` is
# `overwrite`, so the History object does not belong there.
model.save('chatbot_model.h5')

print("model created")

## Reload the saved model and try out the chatbot

In [None]:
from keras.models import load_model
model = load_model('chatbot_model.h5')
import json
import random
# Reload everything the chatbot needs; each file is opened in a `with` block
# so its handle is closed once the content has been read.
with open('/content/drive/My Drive/1000ml/Project 7 - Chatbot/Data/intents.json') as intents_file:
    intents = json.loads(intents_file.read())
# These pickles are the ones written by the vocabulary cell of this notebook;
# pickle.load must not be pointed at files from an untrusted source.
with open('words.pkl','rb') as words_file:
    words = pickle.load(words_file)
with open('classes.pkl','rb') as classes_file:
    classes = pickle.load(classes_file)

In [None]:
def clean_up_sentence(sentence):
    '''Tokenize `sentence` with NLTK and return the list of lower-cased, lemmatized tokens.'''
    return [lemmatizer.lemmatize(token.lower()) for token in nltk.word_tokenize(sentence)]

# bag of words array: one 0/1 flag per vocabulary word, 1 when the word occurs in the sentence
def bow(sentence, words, show_details=True):
    '''Return a numpy array with one entry per item of `words`: 1 if that word
    appears among the cleaned-up tokens of `sentence`, otherwise 0.
    When `show_details` is true, every match is printed as it is found.'''
    tokens = clean_up_sentence(sentence)
    # start with "absent" for every vocabulary position
    bag = [0 for _ in words]
    for token in tokens:
        # every vocabulary position holding this token
        hits = [idx for idx, vocab_word in enumerate(words) if vocab_word == token]
        for idx in hits:
            bag[idx] = 1
            if show_details:
                print ("found in bag: %s" % words[idx])
    return np.array(bag)

def predict_class(sentence, model):
    '''Score `sentence` with `model` and return the intents whose score is above
    ERROR_THRESHOLD, strongest first, as a list of
    {"intent": <class name>, "probability": <score as str>} dicts.'''
    ERROR_THRESHOLD = 0.25
    features = bow(sentence, words, show_details=False)
    scores = model.predict(np.array([features]))[0]
    # keep only the scores above the threshold, highest score first
    ranked = sorted(
        ([idx, score] for idx, score in enumerate(scores) if score > ERROR_THRESHOLD),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return [{"intent": classes[idx], "probability": str(score)} for idx, score in ranked]

def getResponse(ints, intents_json):
    '''Pick a reply for the top predicted intent.

    ints: list of {"intent": ..., "probability": ...} dicts, strongest first
        (the shape returned by predict_class); may be empty.
    intents_json: dict with an 'intents' list whose items carry a 'tag' and a
        list of candidate 'responses'.

    Returns one randomly chosen response string for the first intent whose tag
    matches ints[0]['intent']. If `ints` is empty or no tag matches, a generic
    fallback string is returned instead of raising.
    '''
    fallback = "Sorry, I didn't understand that. Could you rephrase?"
    # predict_class returns an empty list when nothing clears its threshold
    if not ints:
        return fallback
    tag = ints[0]['intent']
    for intent in intents_json['intents']:
        if intent['tag'] == tag:
            return random.choice(intent['responses'])
    # the predicted tag is not described in the intents file
    return fallback

def chatbot_response(msg):
    '''Predict the intent(s) of `msg` with the global `model` and return a reply
    chosen from the global `intents`.'''
    return getResponse(predict_class(msg, model), intents)

In [None]:
# Quick smoke test: ask the bot a budget-style sentence and show its reply.
chatbot_response('I can only spend 1800')

"Okay, I'll see what I can find."

## The GUI

In [None]:
#Creating GUI with tkinter
import tkinter
from tkinter import *


def send():
    '''Button callback: move the text typed in EntryBox into ChatLog, ask the
    chatbot for a reply and append that reply as well.

    ChatLog is normally read-only; it is unlocked only while the new lines are
    inserted. The re-lock happens in a `finally` block so that the log does not
    stay editable if chatbot_response raises.
    '''
    msg = EntryBox.get("1.0",'end-1c').strip()
    EntryBox.delete("0.0",END)

    # nothing typed (or only whitespace): leave the chat log untouched
    if msg == '':
        return

    ChatLog.config(state=NORMAL)
    try:
        ChatLog.insert(END, "You: " + msg + '\n\n')
        ChatLog.config(foreground="#442265", font=("Verdana", 12 ))

        res = chatbot_response(msg)
        ChatLog.insert(END, "Bot: " + res + '\n\n')
    finally:
        ChatLog.config(state=DISABLED)
        ChatLog.yview(END)


# Main window: fixed 400x500, not resizable.
base = Tk()
base.title("Hello")
base.geometry("400x500")
base.resizable(width=FALSE, height=FALSE)

#Create Chat window
ChatLog = Text(base, bd=0, bg="white", height="8", width="50", font="Arial",)

# Keep the chat log read-only; send() unlocks it only while inserting text.
ChatLog.config(state=DISABLED)

#Bind scrollbar to Chat window
scrollbar = Scrollbar(base, command=ChatLog.yview, cursor="heart")
ChatLog['yscrollcommand'] = scrollbar.set

#Create Button to send message
SendButton = Button(base, font=("Verdana",12,'bold'), text="Send", width="12", height=5,
                    bd=0, bg="#32de97", activebackground="#3c9d9b",fg='#ffffff',
                    command= send )

#Create the box to enter message
EntryBox = Text(base, bd=0, bg="white",width="29", height="5", font="Arial")
# NOTE(review): the <Return> binding below is disabled. send() takes no
# arguments, whereas a bound handler is called with an event object, so
# enabling it as written would presumably fail - confirm before re-enabling.
#EntryBox.bind("<Return>", send)


#Place all components on the screen
scrollbar.place(x=376,y=6, height=386)
ChatLog.place(x=6,y=6, height=386, width=370)
EntryBox.place(x=128, y=401, height=90, width=265)
SendButton.place(x=6, y=401, height=90)

# Hand control to the Tk event loop.
base.mainloop()

## Try a bigger list of intents

In [None]:
from pprint import pprint

In [None]:
# Load the Frames dataset; the `with` block closes the file once it has been read.
with open('/content/drive/My Drive/1000ml/Project 7 - Chatbot/Data/frames.json') as frames_file:
    data_file = frames_file.read()
intents = json.loads(data_file)

In [None]:
# Peek at the text of the second turn of the second conversation to see how the data is nested.
intents[1]['turns'][1]['text']

"Hi. Sorry, I can't find any trips from Gotham City to Mos Eisley for you."

# Better Chatbot

For this, we'll take the above chatbot and expand a little bit. I'm going to see what happens if the tags that I give my words are generated from the SpaCy library, rather than hardcoding them. We will use the list of interactions between users and computer in the frames dataset. 

In [None]:
# We'll need spacy for this, and some packages within
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm

# We'll need the following nltk packages
import nltk
nltk.download('punkt')
nltk.download('wordnet')

# We'll use lemmas instead of stems.
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

# We'll need to get rid of punctuation for a cleaner set of words
import re

# We'll need json to open a file of intents
import json

# We will need to save the model
import pickle

# Some helper libraries
import numpy as np
import random

# We'll need these to actually create a machine learning model.
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.optimizers import SGD

# For timing some loops
import time

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# This is how we will process the data.
nlp = en_core_web_sm.load()
# this list will be for a bunch of tokenized words
words = []
# this list is to contain the list of tags contained in the intents. This means the high level characterization of any interactions
classes = []
# This will be a list of tuples which will classify words with their tags, which we will make using SpaCy.
documents = []
# pieces of speech and text to ignore
ignore_words = ['?', '!']
# This is our initial intents file. Read it inside a `with` block so the file
# handle is closed as soon as the text has been read.
with open('/content/drive/My Drive/1000ml/Project 7 - Chatbot/Data/frames.json') as frames_file:
    data_file = frames_file.read()
intents = json.loads(data_file)

In [None]:
# The frames file is huge and contains a bunch of precategorized data. For ease and to avoid
# potential bias, reduce it to a flat list of {'user': ..., 'response': ...} pairs.
# Turns are taken two at a time: the turn at an even index is stored as the user text and
# the turn right after it as the response. A trailing unpaired turn is dropped.
starttime = time.time()
# anything that is not a letter, digit or whitespace gets replaced by a space
non_alnum = re.compile(r'[^a-zA-Z0-9\s]')
new_intents = [
    {
        'user': non_alnum.sub(' ', intent['turns'][t]['text']),
        'response': non_alnum.sub(' ', intent['turns'][t+1]['text'])
    }
    for intent in intents
    for t in range(0, len(intent['turns'])-1, 2)
]
# Just testing how long this takes.
endtime= time.time()
print(f'{endtime-starttime} seconds   OR   {(endtime-starttime)/60} minutes')

0.059102773666381836 seconds   OR   0.0009850462277730305 minutes


In [None]:
# Inspect the second user/response pair to check the cleaning step.
new_intents[1]

{'response': 'I checked the availability for this date and there were no trips available   Would you like to select some alternate dates ',
 'user': 'Yes  how about going to Neverland from Caprica on August 13  2016 for 5 adults  For this trip  my budget would be 1900 '}

The next step here is to take out all the user questions and break them down into words, and give them each a tag as per PoS tags with SpaCy.

In [None]:
# Every back and forth is treated as a question and a response: the user half is
# labelled 'question', the wizard/computer half is labelled 'response'.
for intent in new_intents:
    # tokenize the user half first, then the reply half
    w_u = nltk.word_tokenize(intent['user'])
    w_r = nltk.word_tokenize(intent['response'])
    # both halves feed the shared vocabulary
    words += w_u + w_r
    # one labelled document per half
    documents += [(w_u, 'question'), (w_r, 'response')]

# the only two classes in this setup
classes += ['question', 'response']

In [None]:
# Lemmatize all the words that aren't in our list of things to ignore
words = [lemmatizer.lemmatize(w.lower()) for w in words if w not in ignore_words]
# de-duplicate and sort the words alphabetically
words = sorted(set(words))
# de-duplicate and sort classes alphabetically
classes = sorted(set(classes))

# print out for QA
print (len(documents), "documents")

print (len(classes), "classes", classes)

print (len(words), "unique lemmatized words", words)

# pickle some things to save them for later; the `with` blocks make sure each
# file is flushed and closed before anything tries to read it back.
# NOTE(review): these are the same file names the simple chatbot section writes,
# so its saved vocabulary and classes get overwritten here - confirm that is intended.
with open('words.pkl','wb') as words_file:
    pickle.dump(words, words_file)
with open('classes.pkl','wb') as classes_file:
    pickle.dump(classes, classes_file)

19158 documents
2 classes ['question', 'response']
6361 unique lemmatized words ['0', '00', '000', '0042', '00a', '00am', '00p', '00pm', '01am', '01pm', '02', '02pm', '03pm', '04', '05', '06', '07', '09', '0usd', '1', '10', '100', '1000', '10000', '1001', '1002', '1003', '10036', '10042', '1006', '1007', '1008', '10081', '1012', '1013', '1018', '10199', '10254', '10269', '10300', '10317', '1032', '1035', '1037', '10377', '10392', '10400', '1041', '1044', '1047', '10474', '1048', '1049', '1050', '1051', '1052', '10540', '10546', '1056', '1057', '1058', '10586', '1059', '1061', '1062', '1065', '10656', '1066', '10688', '1069', '1071', '1072', '10737', '10747', '1075', '10757', '1078', '10800', '1081', '1082', '1085', '1086', '1087', '1088', '10887', '10898', '10905', '1091', '10932', '1095', '10956', '1096', '1098', '10986', '10th', '10usd', '11', '110', '1100', '1103', '1105', '1106', '1108', '1109', '11095', '1112', '1113', '11161', '11168', '11185', '1119', '11190', '1123', '1126', '1

In [None]:
# In this cell we are creating our training data: one bag-of-words vector per
# document and the corresponding one-hot class vector.
# initializing training data
training = []
output_empty = [0] * len(classes)
for doc in documents:
    # lemmatize each token of the document - create base word, in attempt to represent related words.
    # A set is used so the vocabulary scan below does a constant-time lookup per word,
    # which matters with thousands of documents and thousands of vocabulary words.
    pattern_words = {lemmatizer.lemmatize(word.lower()) for word in doc[0]}
    # bag of words: 1 where the vocabulary word occurs in the current document, 0 otherwise
    bag = [1 if w in pattern_words else 0 for w in words]

    # output is a '0' for each tag and '1' for current tag (for each pattern)
    output_row = list(output_empty)
    output_row[classes.index(doc[1])] = 1

    training.append([bag, output_row])

In [None]:
# shuffle our features. This appears as though it would make learning a little less biased.
random.shuffle(training)
# create train and test lists. X - patterns, Y - intents
# Each row of `training` is [bag, output_row] and the two parts have different
# lengths (len(words) vs len(classes)). Wrapping that in np.array() builds a
# ragged array, which recent NumPy releases reject with a ValueError, so the
# two columns are pulled straight out of the shuffled list instead.
train_x = [row[0] for row in training]
train_y = [row[1] for row in training]
print("Training data created")

Training data created


In [None]:
# Feed-forward classifier: 128 relu units -> dropout -> 64 relu units -> dropout -> softmax output
# with one unit per class. The input width is the bag-of-words length.
n_features = len(train_x[0])
n_intents = len(train_y[0])
model = Sequential([
    Dense(128, input_shape=(n_features,), activation='relu'),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(n_intents, activation='softmax'),
])

In [None]:
# Compile model. Stochastic gradient descent with Nesterov accelerated gradient gives good results for this model
# NOTE(review): `lr` and `decay` are the older Keras argument names; confirm they
# are still accepted by the Keras version this notebook runs on.
sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
model.compile(loss='categorical_crossentropy', optimizer=sgd, metrics=['accuracy'])

#fitting and saving the model
hist = model.fit(np.array(train_x), np.array(train_y), epochs=200, batch_size=5, verbose=1)
# Only the path is passed: the second positional parameter of `save` is
# `overwrite`, so the History object does not belong there.
# NOTE(review): this is the same file name the simple chatbot section saves to.
model.save('chatbot_model.h5')

print("model created")

In [None]:
from keras.models import load_model
model = load_model('chatbot_model.h5')
import json
import random
# Reload everything the chatbot needs; each file is opened in a `with` block
# so its handle is closed once the content has been read.
with open('/content/drive/My Drive/1000ml/Project 7 - Chatbot/Data/frames.json') as frames_file:
    intents = json.loads(frames_file.read())
# These pickles are the ones written by the vocabulary cell of this notebook;
# pickle.load must not be pointed at files from an untrusted source.
with open('words.pkl','rb') as words_file:
    words = pickle.load(words_file)
with open('classes.pkl','rb') as classes_file:
    classes = pickle.load(classes_file)

In [None]:
def clean_up_sentence(sentence):
    '''Tokenize `sentence` with NLTK and return the list of lower-cased, lemmatized tokens.'''
    return [lemmatizer.lemmatize(token.lower()) for token in nltk.word_tokenize(sentence)]

# bag of words array: one 0/1 flag per vocabulary word, 1 when the word occurs in the sentence
def bow(sentence, words, show_details=True):
    '''Return a numpy array with one entry per item of `words`: 1 if that word
    appears among the cleaned-up tokens of `sentence`, otherwise 0.
    When `show_details` is true, every match is printed as it is found.'''
    tokens = clean_up_sentence(sentence)
    # start with "absent" for every vocabulary position
    bag = [0 for _ in words]
    for token in tokens:
        # every vocabulary position holding this token
        hits = [idx for idx, vocab_word in enumerate(words) if vocab_word == token]
        for idx in hits:
            bag[idx] = 1
            if show_details:
                print ("found in bag: %s" % words[idx])
    return np.array(bag)

def predict_class(sentence, model):
    '''Score `sentence` with `model` and return the classes whose score is above
    ERROR_THRESHOLD, strongest first, as a list of
    {"intent": <class name>, "probability": <score as str>} dicts.'''
    ERROR_THRESHOLD = 0.25
    features = bow(sentence, words, show_details=False)
    scores = model.predict(np.array([features]))[0]
    # keep only the scores above the threshold, highest score first
    ranked = sorted(
        ([idx, score] for idx, score in enumerate(scores) if score > ERROR_THRESHOLD),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return [{"intent": classes[idx], "probability": str(score)} for idx, score in ranked]

def getResponse(ints, intents_json):
    '''Pick a reply for the top predicted intent.

    ints: list of {"intent": ..., "probability": ...} dicts, strongest first
        (the shape returned by predict_class); may be empty.
    intents_json: dict with an 'intents' list whose items carry a 'tag' and a
        list of candidate 'responses'.

    Returns one randomly chosen response string for the first intent whose tag
    matches ints[0]['intent']. If `ints` is empty or no tag matches, a generic
    fallback string is returned instead of raising.

    NOTE(review): in this section `intents` is loaded from frames.json, which
    other cells index as a list of conversations with a 'turns' key. That
    structure has no 'intents'/'tag'/'responses' entries, so this lookup cannot
    work on it as-is - the response selection for the frames data still needs
    to be designed.
    '''
    fallback = "Sorry, I didn't understand that. Could you rephrase?"
    # predict_class returns an empty list when nothing clears its threshold
    if not ints:
        return fallback
    tag = ints[0]['intent']
    for intent in intents_json['intents']:
        if intent['tag'] == tag:
            return random.choice(intent['responses'])
    # the predicted tag is not described in the intents data
    return fallback

def chatbot_response(msg):
    '''Predict the class(es) of `msg` with the global `model` and return a reply
    chosen from the global `intents`.'''
    return getResponse(predict_class(msg, model), intents)

# Stuff I don't need right now

In [None]:
# Show the 'turns' entry of the first row.
# NOTE(review): `json_df` is not created in any cell visible in this notebook, so
# on a fresh kernel this cell would raise NameError - confirm where it comes from.
json_df['turns'].iloc[0]

In [None]:
# These are dialogues from Ubuntu users, loaded from three CSV files.
# NOTE(review): `pd` (pandas) is not imported in any cell visible in this notebook -
# confirm it is imported before this cell runs.
dialogue_csv_paths = [
    '/content/drive/My Drive/1000ml/Project 7 - Chatbot/Data/dialogueText.csv',
    '/content/drive/My Drive/1000ml/Project 7 - Chatbot/Data/dialogueText_196.csv',
    '/content/drive/My Drive/1000ml/Project 7 - Chatbot/Data/dialogueText_301.csv',
]
df1, df2, df3 = [pd.read_csv(csv_path) for csv_path in dialogue_csv_paths]

In [None]:
# First two rows, to see which columns the dialogue file has.
df1.head(2)

Unnamed: 0,folder,dialogueID,date,from,to,text
0,3,126125.tsv,2008-04-23T14:55:00.000Z,bad_image,,"Hello folks, please help me a bit with the fol..."
1,3,126125.tsv,2008-04-23T14:56:00.000Z,bad_image,,Did I choose a bad channel? I ask because you ...


In [None]:
# Display the text column cast to str; the result is only shown, df1 itself is not modified here.
df1['text'].astype(str)

0          Hello folks, please help me a bit with the fol...
1          Did I choose a bad channel? I ask because you ...
2          the second sentence is better english   and we...
3                                               Sock Puppe?t
4                                                       WTF?
                                 ...                        
1038319                                           anyone on?
1038320                                                  yes
1038321    can I get a pastebin of someones menu.lst with...
1038322                         http://pastebin.com/fe921690
1038323                                               thanks
Name: text, Length: 1038324, dtype: object

In [None]:
# With these, I'll want to extract the text from each individual conversation. I should be able to do this with a simple group by statement
# Cast to str first so that every value can be joined.
df1['text'] = df1['text'].astype(str)
# Join the messages of a dialogue with a space: joining on '' glues the end of one
# message to the start of the next (e.g. "ohh to latehttp://...").
df1.groupby('dialogueID')['text'].agg(lambda messages: ' '.join(messages))

dialogueID
1.tsv        Also guys, I'm trying to get into my FIrefox p...
10.tsv       ugh ;(  http://planet.ubuntulinux.org seems to...
100.tsv      ohh to latehttp://www.ubuntulinux.org/ubuntu/l...
1000.tsv     see bug 67085sorry, typo, ignore that... try t...
10000.tsv    How do I get out of this annoying unity? I can...
                                   ...                        
99995.tsv    What's locale got to do with 24 hour format?24...
99996.tsv    hi all.. i am using rythmbox,  is there aw ay ...
99997.tsv    hi i want to know how to i use the Huawei E220...
99998.tsv    umm !res is way out of date or something? dpkg...
99999.tsv    Does anyone know a good linux distribution ?de...
Name: text, Length: 346108, dtype: object