In [1]:
# We'll first need the data from my drive.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


# Outline

Latest Idea: write out a bunch of responses that the chatbot can have 
based on what the user writes. Presently, I'm thinking that the responses should be based on what information the user provides. For example, say the user provides in their sentence a place they want to go and their budget. The bot would then ask, "Okay, so you would like to go to {{this place}} and have a budget of {{this much}}. Where will you be travelling from and what date would you like to leave?" 

## Pseudocode

* import packages needed
* import data
* import intents
* tokenize data
* give a label to words using NER 
* train to recognize what information is contained in the sentence to give the correct response. 
  * This also entails recognizing whether the city stated is the origin or destination city
  * This should be able to handle budgets of all types, even no budget
  * We also need to be able to handle a variation of dates, and pick out the start and end dates
* Hard code some responses to the various permutations of data that is collected for each user question/statement.
  * Greeting
  * Greeting + Statement/Question relating to booking
  * Greeting + general question/statement
  * General question/statement
  * Statement/Question relating to booking
  * confirmations/rejections
  * conversation terminations
* The conversation should continue until the termination step

## Packages

In [42]:
path='/content/drive/My Drive/1000ml/Project 7 - Chatbot/Data/'

# We'll need the following nltk packages
import nltk
nltk.download('punkt')
nltk.download('wordnet')

# We'll use lemmas instead of stems.
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

# We will tag the parts of the speech 
from nltk.tag import pos_tag

# Get the stop words
from nltk.corpus import stopwords
nltk.download('stopwords')

# We'll need json to open a file of intents
import json

# We will need to save the model
import pickle

# Some helper libraries
import numpy as np
import pandas as pd
import random
import re

# We'll need these to actually create a machine learning model.
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.optimizers import SGD, Adagrad, Adam, Adadelta

# We may need to try out a grid search
from sklearn.model_selection import GridSearchCV

# We'll need this for the in-code tagging of words
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
nlp = en_core_web_sm.load()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# The Chatbot

In [43]:
# this list will be for a bunch of tokenized words, it will come from all the data I have available. 
# These will help my model train for many instances of conversation to make sure to pull out the right data
words=[]
# this list will be the list of entities or classes that each word is defined by. 
classes = []
# This will be a list of tuples which will classify words with their tags.
documents = []

# We'll need a list of stop words, words that aren't needed, to filter out. 
# Note that we will exclude stop words 'to' and 'from' as they are important identifiers for origin/destination tagging
stop_words = stopwords.words('english')
stop_words.remove('from')
stop_words.remove('to')

# The file below is a travel related with questions and responses
data_file1 = open(f'{path}frames.json').read()
travel_txt = json.loads(data_file1)

# This batch of files contains other travel data, questions, no answers
for i in range(1,9):
  for j in range(1,5):
    df_temp=pd.read_csv(f'{path}{i}_{j}_align.csv')
    if (j==1&i==1):
      QA_df = df_temp
    else:
      QA_df=pd.concat([QA_df, df_temp], ignore_index=True)

# These files contain a whole bunch of conversation type text, which are from internet forums
# data_file2 = pd.read_csv(f'{path}dialogueText.csv')
# data_file3 = pd.read_csv(f'{path}dialogueText_196.csv')
# data_file4 = pd.read_csv(f'{path}dialogueText_301.csv')

# data_file2['text'] = data_file2['text'].astype(str)
# data_file2.groupby('dialogueID')['text'].agg(lambda x: ''.join(x)).reset_index().drop(columns=['dialogueID'])
# data_file3['text'] = data_file2['text'].astype(str)
# data_file3.groupby('dialogueID')['text'].agg(lambda x: ''.join(x)).reset_index().drop(columns=['dialogueID'])
# data_file4['text'] = data_file2['text'].astype(str)
# data_file4.groupby('dialogueID')['text'].agg(lambda x: ''.join(x)).reset_index().drop(columns=['dialogueID'])

In [None]:
# Let's organize the json file into a list of conversations
text_list = []
# First, we'll loop through each conversation
for convo in travel_txt:
  for t in range(len(convo['turns'])):
    text_list.append(re.sub(r'[^a-zA-Z0-9\s]', ' ', convo['turns'][t]['text']))

In [None]:
# Next, let's do the same thing with the data frame
QA_df = QA_df.dropna(subset=['Text'])
text_list.extend(list(QA_df.Text.values))
# Get rid of all the non-alpha-numeric characters
for t in range(len(text_list)):
  text_list[t] = re.sub(r'[^a-zA-Z0-9\s]', ' ', text_list[t])

In [None]:
# Let's loop through this mass of 
for text in text_list:
  # take each word and tokenize it
  w = nltk.word_tokenize(text)
  words.extend(w)
  # adding entities to our class list
  nlp_words = nlp(text)
  classes.extend([x.ent_type_ for x in nlp_words if x.ent_type_ not in classes])

In [None]:
# Lemmatize all the words that aren't in the list of stop words for English. We also only care about the unique words, so we stick the list of words in a set.
# I want each part to be immutable, so I'll make it a list of tuples
word_class = [(lemmatizer.lemmatize(w.lower()), nlp(w)[0].ent_type_) for w in set(words) if w not in stop_words]

# Now for just a list of unique words, use a set, convert to list and sort the words alphabetically
unique_words = sorted(list(set(words)))
# sort classes alphabetically
classes = sorted(list(set(classes)))

# print out for QA
print (len(documents), "documents")

print (len(classes), "classes", classes)

print (len(unique_words), "unique lemmatized words", unique_words)

# pickle some things to save them for later
pickle.dump(unique_words,open('words.pkl','wb'))
pickle.dump(classes,open('classes.pkl','wb'))

0 documents
19 classes ['', 'CARDINAL', 'DATE', 'EVENT', 'FAC', 'GPE', 'LANGUAGE', 'LAW', 'LOC', 'MONEY', 'NORP', 'ORDINAL', 'ORG', 'PERCENT', 'PERSON', 'PRODUCT', 'QUANTITY', 'TIME', 'WORK_OF_ART']


In [None]:
# In this cell we are creating our training data. It is basically creating a word vector for each word, and corresponding class vector
# initializing training data
training = []

# A few empty lists for form the bags of words and training data.
output_empty = [0] * len(classes)
empty_wordvec = [0] * len(word_class)

for w in range(len(word_class)):
  # creating word vectors for each word
  bag = list(empty_wordvec)
  bag[w] = 1

  # word class is a list of tuples of two elements. The first is the lemmatized word and the second is the Entity name
  output_row = list(output_empty)
  output_row[classes.index(word_class[w][1])] = 1

  # output is a '0' for each tag and '1' for current tag (for each pattern)
  output_row = list(output_empty)
  output_row[classes.index(word_class[w][1])] = 1

  training.append([bag, output_row])

In [None]:
# shuffle our features and turn into np.array. This appears as though it would make learning a little less biased.
random.shuffle(training)
training = np.array(training)
# create train and test lists. X - patterns, Y - intents
train_x = list(training[:,0])
train_y = list(training[:,1])
print("Training data created")

Training data created


In [None]:
# Create model - 3 layers. First layer 128 neurons, second layer 64 neurons and 3rd output layer contains number of neurons
# equal to number of intents to predict output intent with softmax
model = Sequential()
model.add(Dense(256, input_shape=(len(train_x[0]),), activation='tanh'))
model.add(Dropout(0.3))
model.add(Dense(128, activation='sigmoid'))
model.add(Dropout(0.3))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(len(train_y[0]), activation='softmax'))

In [None]:
# Compile model. Stochastic gradient descent with Nesterov accelerated gradient gives good results for this model
sgd = SGD(lr=0.02, decay=1e-6, momentum=0.5, nesterov=True)
model.compile(loss='categorical_crossentropy', optimizer='Adagrad', metrics=['accuracy'])

#fitting and saving the model
hist = model.fit(np.array(train_x), np.array(train_y), verbose=1, batch_size=10, epochs=20)
model.save('chatbot_model.h5', hist)

print("model created")

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
model created


## Grid Search for best model

I'm not terribly sure how exactly this works, and couldn't get it to work, so I abandoned it and went with just changing the parameters myself.

In [None]:
# Make parameters
optimizer = [#'SGD', 
             'RMSprop', 
             'Adagrad', 
             'Adadelta', 
             #'Adam', 
             'Adamax', 
             #'Nadam'
             ]
init_mode = ['uniform', 
             'lecun_uniform', 
             'normal', 
             'zero', 
             #'glorot_normal', 
             #'glorot_uniform', 
             #'he_normal', 
             #'he_uniform'
             ]
activation = ['softmax', 
              #'softplus', 
              #'softsign', 
              'relu', 
              'tanh', 
              'sigmoid', 
              #'hard_sigmoid', 
              #'linear'
              ]
#weight_constraint = [1,2,3,4]
# dropout_rate=[0.0,0.1,0.2,0.3]
#batch_size = [25,50,75,100]

param_grid=dict(optimizer=optimizer,
                init_mode=init_mode,
                activation=activation,
                #weight_constraint=weight_constraint,
                #dropout_rate=dropout_rate,
                #batch_size=batch_size
                )

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
def create_model():
  model = Sequential()
  model.add(Dense(304, input_shape=(len(train_x[0]),), activation='relu'))
  model.add(Dropout(0.3))
  model.add(Dense(76, activation=activation))
  model.add(Dense(len(train_y[0]), activation=activation))

  model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
  return model

In [None]:
grid = GridSearchCV(estimator=create_model(), param_grid=param_grid, n_jobs=-1)
grid_result = grid.fit(np.array(train_x), np.array(train_y), epochs=10)

ValueError: ignored

# Reload the model and try out chatbot

In [21]:
from keras.models import load_model
model = load_model('chatbot_model.h5')
import json
import random
words = pickle.load(open('words.pkl','rb'))
classes = pickle.load(open('classes.pkl','rb'))

In [None]:
def clean_up_sentence(sentence):
    '''This function takes in a sentence, splits it into words, lemmatizes them and returns the list of words.'''
    sentence_words = nltk.word_tokenize(sentence)
    sentence_words = [lemmatizer.lemmatize(word.lower()) for word in sentence_words]
    return sentence_words

# return bag of words array: 0 or 1 for each word in the bag that exists in the sentence
def bow(sentence, words, show_details=True):
    '''This function takes in a sentence, a bunch of words and returns a bag of words for words present in that sentence.'''
    # tokenize the pattern
    sentence_words = clean_up_sentence(sentence)
    # bag of words - matrix of N words, vocabulary matrix
    bag = [0]*len(words)
    for s in sentence_words:
        for i,w in enumerate(words):
            if w == s:
                # assign 1 if current word is in the vocabulary position
                bag[i] = 1
                if show_details:
                    print ("found in bag: %s" % w)
    return(np.array(bag))

def predict_class(sentence, model):
    ''''''
    # filter out predictions below a threshold
    p = bow(sentence, words,show_details=False)
    res = model.predict(np.array([p]))[0]
    ERROR_THRESHOLD = 0.25
    results = [[i,r] for i,r in enumerate(res) if r>ERROR_THRESHOLD]
    # sort by strength of probability
    results.sort(key=lambda x: x[1], reverse=True)
    return_list = []
    for r in results:
        return_list.append({"intent": classes[r[0]], "probability": str(r[1])})
    return return_list

def getResponse(ints, intents_json):
    tag = ints[0]['intent']
    list_of_intents = intents_json['intents']
    for i in list_of_intents:
        if(i['tag']== tag):
            result = random.choice(i['responses'])
            break
    return result

def chatbot_response(msg):
    ints = predict_class(msg, model)
    res = getResponse(ints, intents)
    return res

In [None]:
# We'll need to write a function that outlines different sentences that the chatbot should be able to say back. 
# I'll also need a function to pick out each named entity within the user response. This will involve testing against the model.
# Either within that last function or another fuction, I'll have to make sure to figure out whether an input date or location corresponds to the origin or destination.
# The functions written below, happen in reverse order, since the one at the bottom necessarily need the ones above.

# I'll also want to collect new words 

def clean_up_sentence(sentence):
  '''
  This function takes in a sentence, splits it into words, lemmatizes them and returns the list of words.
  '''
  sentence_words = nltk.word_tokenize(sentence)
  sentence_words = [lemmatizer.lemmatize(word.lower()) for word in sentence_words]
  return sentence_words

# return bag of words array: 0 or 1 for each word in the bag that exists in the sentence
def bow(sentence, words, show_details=True):
  '''
  This function takes in a sentence, a bunch of words and returns a bag of words for words present in that sentence.
  It is optional to use the show_details parameter. This tells which words were found in the bag of words. 
  '''
  global new_words
  new_words = words
  # tokenize the pattern with another function
  sentence_words = clean_up_sentence(sentence)
  # bag of words - matrix of N words, vocabulary matrix
  bag = [0]*len(words)
  # loop through words in sentence, assign them a word vector
  for s in sentence:
    if (s in words):
      idx = words.index(s)
      bag[idx] = 1
      if show_details:
          print ("found in bag: %s" % w)
    else:
      new_words = sorted(new_words.append(s))
  return(np.array(bag))

def predict_class(sentence, model):
    '''
    This function takes in the users response, converts each word to a word vector and tests it in the model. 
    Once it knows what each word is, I'll need each of the important words assigned to a variable. It is here I'll also need the function to pick out origin/destination.
    '''
    # filter out predictions below a threshold
    phrase = bow(sentence, words, show_details=False)
    result = [model.predict(np.array([w]))[0] for w in phrase]
    ERROR_THRESHOLD = 0.25
    results = [[i,r] for i,r in enumerate(res) if r>ERROR_THRESHOLD]
    # sort by strength of probability
    results.sort(key=lambda x: x[1], reverse=True)
    return_list = []
    for r in results:
        return_list.append({"intent": classes[r[0]], "probability": str(r[1])})
    return return_list

# first, we get the message
def chatbot_response(msg):
  '''
  This function takes in the user message and calls the model to act on it, predicting the class of each of the words.
  Then it produces a response based on the results of the model call.
  '''
  ints = predict_class(msg, model)
  response = getResponse(ints, intents)
  return response

In [8]:
for i, w in enumerate(words):
  print(f'Word {i} is {w}')
  if(i==5):
    break

Word 0 is 0
Word 1 is 00
Word 2 is 000
Word 3 is 0000
Word 4 is 001
Word 5 is 001CD


In [26]:
def clean_up_sentence(sentence):
  '''
  This function takes in a sentence, splits it into words, lemmatizes them and returns the list of words.
  '''
  sentence_words = nltk.word_tokenize(sentence)
  sentence_words = [lemmatizer.lemmatize(word.lower()) for word in sentence_words]
  return sentence_words

sen = clean_up_sentence('I want to book a trip from Toronto to Bangledesh for $2250')

In [27]:
def bow(sentence, words, show_details=True):
  '''
  This function takes in a sentence, a bunch of words and returns a bag of words for words present in that sentence.
  It is optional to use the show_details parameter. This tells which words were found in the bag of words. 
  '''
  # tokenize the pattern with another function
  #sentence_words = clean_up_sentence(sentence)
  # bag of words - matrix of N words, vocabulary matrix
  bag = [0]*len(words)
  # loop through words in sentence, assign them a word vector
  for s in sentence:
    if (s in words):
      idx = words.index(s)
      bag[idx] = 1
      if show_details:
          print ("found in bag: %s" % w)
    else:
      new_words = sorted(new_words.append(s))
  return(np.array(bag))

bow(sen, words, show_details=True)

found in bag: i
found in bag: want
found in bag: to
found in bag: book
found in bag: a
found in bag: trip
found in bag: from
found in bag: toronto
found in bag: to
found in bag: for


array([0, 0, 0, ..., 0, 0, 0])

In [30]:
words.index(sen[0])

12426

In [35]:
import spacy
from spacy import displacy
text='I want to book a trip from Toronto to vancouver on June 23 for less than 1200 dollars'
nlp=spacy.load('en_core_web_sm')
doc=nlp(text)
displacy.render(doc, style='ent',jupyter=True)

In [38]:
[d.ent_type_ for d in doc]

['',
 '',
 '',
 '',
 '',
 '',
 '',
 'GPE',
 '',
 'GPE',
 '',
 'DATE',
 'DATE',
 '',
 'MONEY',
 'MONEY',
 'MONEY',
 'MONEY']

In [73]:
for turn in travel_txt[0]['turns']:
  print(turn['text'])

I'd like to book a trip to Atlantis from Caprica on Saturday, August 13, 2016 for 8 adults. I have a tight budget of 1700.
Hi...I checked a few options for you, and unfortunately, we do not currently have any trips that meet this criteria.  Would you like to book an alternate travel option?
Yes, how about going to Neverland from Caprica on August 13, 2016 for 5 adults. For this trip, my budget would be 1900.
I checked the availability for this date and there were no trips available.  Would you like to select some alternate dates?
I have no flexibility for dates... but I can leave from Atlantis rather than Caprica. How about that?
I checked the availability for that date and there were no trips available.  Would you like to select some alternate dates?
I suppose I'll speak with my husband to see if we can choose other dates, and then I'll come back to you.Thanks for your help


In [74]:
len(travel_txt)

1369

In [79]:
for i, text in enumerate(travel_txt):
  if i in range(100,120):
    print(text['turns'][0]['text'])

Hi there! I finally have some vacation time starting on August 22nd and I am hoping to go away somewhere really far away and getting away from it all. Do you have any suggestions for me?
I'd like to book a trip to Monterrey from August 17th to August 25th.
hey i wanna go to hiroshima w my fam
Hi! I'm looking to book a trip somewhere between August 19th and September 3rd. My budget is 1700.
hey, i'm looking to go to Theed from Caprica on August 13th
I'd like to book a trip from London to somewhere, for August 17th to August 31st.
ay whats up?
Good morning.  We are planning a family reunion and since we are spread out all over the world, we have decided on a couple of possibilities. Can you please give me information on packages available in Fukuoka? My husband and I would be leaving from Manaus and we will be travelling with our 4 children.
hi im in mannheim and want to spend a week away
Hi. I'd like to find a trip to San Diego. I can leave from Belo Horizonte. I'd like to travel betwee

In [108]:
convo = 0
for t in range(len(travel_txt[convo]['turns'])):
  print(travel_txt[convo]['turns'][t]['labels']['acts'][0]['name'], ' : ',travel_txt[convo]['turns'][t]['text'])

inform  :  I'd like to book a trip to Atlantis from Caprica on Saturday, August 13, 2016 for 8 adults. I have a tight budget of 1700.
no_result  :  Hi...I checked a few options for you, and unfortunately, we do not currently have any trips that meet this criteria.  Would you like to book an alternate travel option?
inform  :  Yes, how about going to Neverland from Caprica on August 13, 2016 for 5 adults. For this trip, my budget would be 1900.
no_result  :  I checked the availability for this date and there were no trips available.  Would you like to select some alternate dates?
inform  :  I have no flexibility for dates... but I can leave from Atlantis rather than Caprica. How about that?
no_result  :  I checked the availability for that date and there were no trips available.  Would you like to select some alternate dates?
thankyou  :  I suppose I'll speak with my husband to see if we can choose other dates, and then I'll come back to you.Thanks for your help


In [112]:
travel_txt[convo]['turns'][1]['labels']['acts']

[{'args': [{'key': 'ref',
    'val': [{'annotations': [], 'frame': 1, 'fromrange': False}]}],
  'name': 'no_result'},
 {'args': [{'key': 'dst_city'}], 'name': 'suggest'},
 {'args': [], 'name': 'sorry'}]