In [1]:
# Import library
import io
import os
import re
import zipfile

import numpy as np
import requests
import yaml # data serialisation

from gensim.models import Word2Vec
from keras import Input, Model
from keras.activations import softmax
from keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.optimizers import RMSprop
from keras. preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from keras_preprocessing.text import Tokenizer

In [3]:
# data preparation
res = requests.get('https://uplevelsg.s3.ap-southeast-1.amazonaws.com/CommonAssets/chatbot_nlp.zip') # small dataset
z = zipfile.ZipFile(io.BytesIO(res.content))
z.extractall()

In [4]:
# get a list of yml files
dir_path = 'chatbot_nlp/data'
files_list = os.listdir(dir_path + os.sep) # os.sep = /
files_list

['humor.yml',
 'greetings.yml',
 'food.yml',
 'computers.yml',
 'money.yml',
 'sports.yml',
 'gossip.yml',
 'science.yml',
 'psychology.yml',
 'trivia.yml',
 'ai.yml',
 'literature.yml',
 'emotion.yml',
 'history.yml',
 'movies.yml',
 'health.yml',
 'botprofile.yml',
 'politics.yml']

In [5]:
len(files_list)

18

In [7]:
# Let's take a quick look at the yml
test_yml = files_list[1]
test_stream = open(dir_path + "/" + test_yml, 'rb') # raw binary
test_doc = yaml.safe_load(test_stream)
test_doc

{'categories': ['greetings'],
 'conversations': [['Hello', 'Hi'],
  ['Hi', 'Hello'],
  ['Greetings!', 'Hello'],
  ['Hello', 'Greetings!'],
  ['Hi, How is it going?', 'Good'],
  ['Hi, How is it going?', 'Fine'],
  ['Hi, How is it going?', 'Okay'],
  ['Hi, How is it going?', 'Great'],
  ['Hi, How is it going?', 'Could be better.'],
  ['Hi, How is it going?', 'Not so great.'],
  ['How are you doing?', 'Good.'],
  ['How are you doing?', 'Very well, thanks.'],
  ['How are you doing?', 'Fine, and you?'],
  ['Nice to meet you.', 'Thank you.'],
  ['How do you do?', "I'm doing well."],
  ['How do you do?', "I'm doing well. How are you?"],
  ['Hi, nice to meet you.', 'Thank you. You too.'],
  ['It is a pleasure to meet you.', 'Thank you. You too.'],
  ['Top of the morning to you!', 'Thank you kindly.'],
  ['Top of the morning to you!', 'And the rest of the day to you.'],
  ["What's up?", 'Not much.'],
  ["What's up?", 'Not too much.'],
  ["What's up?", 'Not much, how about you?'],
  ["What's up?

In [8]:
def clean_text(text_to_clean):
    res = text_to_clean.lower()
    res = re.sub(r"i'm", "i am", res)
    res = re.sub(r"he's", "he is", res)
    res = re.sub(r"she's", "she is", res)
    res = re.sub(r"it's", "it is", res)
    res = re.sub(r"that's", "that is", res)
    res = re.sub(r"what's", "what is", res)
    res = re.sub(r"where's", "where is", res)
    res = re.sub(r"how's", "how is", res)
    res = re.sub(r"\'ll", " will", res)
    res = re.sub(r"\'ve", " have", res)
    res = re.sub(r"\'re", " are", res)
    res = re.sub(r"\'d", " would", res)
    res = re.sub(r"\'re", " are", res)
    res = re.sub(r"won't", "will not", res)
    res = re.sub(r"can't", "cannot", res)
    res = re.sub(r"n't", " not", res)
    res = re.sub(r"n'", "ng", res)
    res = re.sub(r"'bout", "about", res)
    res = re.sub(r"'til", "until", res)
    res = re.sub(r"[-()\"#/@;:<>{}`+=~|.!?,]", "", res)
    return res

In [10]:
# Create a list of questions
questions = []

# Create a list of answers
answers = []

# Loop through each yml file
for filepath in files_list:

  # Open up a stream
  stream = open(dir_path + "/" + filepath, 'rb')

  # Load yml file into a variable
  doc = yaml.safe_load(stream)

  # Retrieve conversation from yaml file
  conversations = doc['conversations']

  # Loop through the conversations
  for conversation in conversations:
    # If conversation has 3 items or more
    if len(conversation) > 2:
      questions.append(conversation[0])

      # Add the rest of the list as answers
      ans = ''
      for rep in conversation[1:]:
        ans += ' ' + rep
      
      answers.append(ans)

    elif len(conversation) > 1:
      questions.append(conversation[0])
      answers.append(conversation[1])

In [11]:
answers

['Did you hear the one about the mountain goats in the andes? It was "ba a a a a a d".',
 "I never forget a face, but in your case I'll make an exception.",
 'It is better to be silent and be thought a fool, than to open your mouth and remove all doubt.',
 "O'm a not a comedy why don't you check out a joke?",
 'two vultures boarded a plane, each carrying two dead raccoons. the  stewardess stops them and says "sorry sir, only one carrion per  passenger." ',
 'what did the buddhist say to the hot dog vendor?  "make me one with everthing." ',
 'nasa recently sent a number of holsteins into orbit for experimental purposes. they called it the herd shot round the world. ',
 'two boll weevils grew up in s. carolina. one took off to hollywood  and became a rich star. the other stayed in carolina and never amounted  to much -- and naturally became known as the lesser of two weevils. ',
 "Two eskimos in a kayak were chilly, so they started a fire, which sank the craft, proving the old adage you 

Need to tag text:

"I am A"

"<START> I am A <END>"

In [12]:
# Further cleaning
answers_with_tags = []

# Loop through list of answers
for i in range(len(answers)):
  # Make sure answer is a string
  if type(answers[i]) == str:
    answers_with_tags.append(answers[i])
  
  # Remove corresponding question
  else:
    questions.pop(i)

In [13]:
# Create a new answers list
answers = []

# Add <START> and <END> tag to the sentences
for i in range(len(answers_with_tags)):
  answers.append('<START> ' + answers_with_tags[i] + ' <END>')

In [14]:
answers

['<START> Did you hear the one about the mountain goats in the andes? It was "ba a a a a a d". <END>',
 "<START> I never forget a face, but in your case I'll make an exception. <END>",
 '<START> It is better to be silent and be thought a fool, than to open your mouth and remove all doubt. <END>',
 "<START> O'm a not a comedy why don't you check out a joke? <END>",
 '<START> two vultures boarded a plane, each carrying two dead raccoons. the  stewardess stops them and says "sorry sir, only one carrion per  passenger."  <END>',
 '<START> what did the buddhist say to the hot dog vendor?  "make me one with everthing."  <END>',
 '<START> nasa recently sent a number of holsteins into orbit for experimental purposes. they called it the herd shot round the world.  <END>',
 '<START> two boll weevils grew up in s. carolina. one took off to hollywood  and became a rich star. the other stayed in carolina and never amounted  to much -- and naturally became known as the lesser of two weevils.  <END>'

In [15]:
# Check if the lengths are equal
print(len(questions))
print(len(answers))

564
564


## Model Training

In [16]:
combined_sentences = questions + answers

In [17]:
# Declare a regex
target_regex = '!"#$%&()*+,-./:;<=>?@[\]^_`{|}~\t\n\'0123456789'

# Declare a tokenizer and filter away all unwanted characters
tokenizer = Tokenizer(filters = target_regex)

# Fit tokenizer on both questions and answers
tokenizer.fit_on_texts(combined_sentences)

# Get vocab size
VOCAB_SIZE = len(tokenizer.word_index) + 1

In [18]:
tokenizer.word_index

{'end': 1,
 'start': 2,
 'you': 3,
 'i': 4,
 'a': 5,
 'the': 6,
 'is': 7,
 'of': 8,
 'to': 9,
 'what': 10,
 'are': 11,
 'do': 12,
 'not': 13,
 'and': 14,
 'me': 15,
 'it': 16,
 'that': 17,
 'in': 18,
 'have': 19,
 'can': 20,
 't': 21,
 'am': 22,
 'tell': 23,
 'as': 24,
 'get': 25,
 'my': 26,
 'when': 27,
 'm': 28,
 'your': 29,
 'how': 30,
 'joke': 31,
 's': 32,
 'like': 33,
 'be': 34,
 'an': 35,
 'about': 36,
 'feel': 37,
 'computer': 38,
 'who': 39,
 'or': 40,
 'for': 41,
 'don': 42,
 'by': 43,
 'no': 44,
 'cross': 45,
 'with': 46,
 'software': 47,
 'on': 48,
 'all': 49,
 'much': 50,
 'think': 51,
 'but': 52,
 'he': 53,
 'very': 54,
 'which': 55,
 'at': 56,
 'know': 57,
 'any': 58,
 'why': 59,
 'could': 60,
 'was': 61,
 'so': 62,
 'we': 63,
 'one': 64,
 'should': 65,
 'from': 66,
 'make': 67,
 'more': 68,
 'if': 69,
 'robots': 70,
 'will': 71,
 'stock': 72,
 'favorite': 73,
 'did': 74,
 'die': 75,
 'hal': 76,
 'say': 77,
 'emotion': 78,
 'been': 79,
 'human': 80,
 'robot': 81,
 'does'

In [19]:
VOCAB_SIZE

1856

In [21]:
# Turn questions into tokens
# hold ctrl + space
tokenized_questions = tokenizer.texts_to_sequences(questions)

# Get max length of questions
maxlen_questions = max([len(x) for x in tokenized_questions])

# Get a padded array of words based on the max length of questions
encoder_input_data = pad_sequences(tokenized_questions,
                                   maxlen = maxlen_questions,
                                   padding = 'post')

In [23]:
encoder_input_data.shape

(564, 22)

In [24]:
# Do the same for answers
tokenized_answers = tokenizer.texts_to_sequences(answers)

# Get max length of answers
maxlen_answers = max([len(x) for x in tokenized_answers])

# Get a padded array of words based on max length of answers
decoder_input_data = pad_sequences(tokenized_answers,
                                   maxlen = maxlen_answers,
                                   padding = 'post')

In [25]:
decoder_input_data.shape

(564, 74)

In [26]:
for i in range(len(tokenized_answers)):
  tokenized_answers[i] = tokenized_answers[i][1:]

padded_answers = pad_sequences(tokenized_answers,
                               maxlen = maxlen_answers,
                               padding= 'post')

decoder_output_data = to_categorical(padded_answers, VOCAB_SIZE)

decoder_output_data.shape

(564, 74, 1856)

In [27]:
decoder_output_data

array([[[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.]],

       ...,

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0.

## Model Building (Part II)

In [32]:
# Define encoder
enc_inputs = Input(shape = (None,))

# Define embedding layer with input_dim, output_dim
enc_embedding = Embedding(VOCAB_SIZE, 200, mask_zero=True)(enc_inputs)

# Get hidden state and cell state from LSTM
_, state_h, state_c = LSTM(200, return_state=True)(enc_embedding)

# Store states in a list
enc_states = [state_h, state_c]

# Define decoder
dec_inputs = Input(shape = (None,))
dec_embedding = Embedding(VOCAB_SIZE, 200, mask_zero=True)(dec_inputs)
dec_lstm = LSTM(200, return_state = True, return_sequences=True)

# Initialize LSTM layer with encoder's states
dec_outputs, _, _ = dec_lstm(dec_embedding, initial_state=enc_states)

# Add a dense layer
dec_dense = Dense(VOCAB_SIZE, activation= softmax)

output = dec_dense(dec_outputs)

model = Model([enc_inputs, dec_inputs], output)
model.compile(optimizer = RMSprop(), loss = 'categorical_crossentropy')

model.summary()

model.fit([encoder_input_data, decoder_input_data],
          decoder_output_data,
          batch_size = 50,
          epochs = 300)

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_6 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
input_7 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, None, 200)    371200      input_6[0][0]                    
__________________________________________________________________________________________________
embedding_4 (Embedding)         (None, None, 200)    371200      input_7[0][0]                    
______________________________________________________________________________________________

<keras.callbacks.History at 0x7f50492b3e90>

# Model Usage

In [33]:
# Define inference model
enc_model = Model(inputs = enc_inputs, outputs = enc_states)

In [35]:
dec_state_input_h = Input(shape = (200, ))
dec_state_input_c = Input(shape = (200, ))

dec_states_inputs = [dec_state_input_h, dec_state_input_c]

dec_outputs, state_h, state_c = dec_lstm(dec_embedding, initial_state = dec_states_inputs)

dec_states = [state_h, state_c]

dec_outputs = dec_dense(dec_outputs)

dec_model = Model(inputs = [dec_inputs] + dec_states_inputs,
                  outputs = [dec_outputs] + dec_states)

dec_model.summary()

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_7 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding_4 (Embedding)         (None, None, 200)    371200      input_7[0][0]                    
__________________________________________________________________________________________________
input_10 (InputLayer)           [(None, 200)]        0                                            
__________________________________________________________________________________________________
input_11 (InputLayer)           [(None, 200)]        0                                            
____________________________________________________________________________________________

In [36]:
def str_to_tokens(sentence: str):
    words = sentence.lower().split()
    tokens_list = list()
    for current_word in words:
        result = tokenizer.word_index.get(current_word, '')
        if result != '':
            tokens_list.append(result)
    return pad_sequences([tokens_list],
                         maxlen=maxlen_questions,
                         padding='post')

In [37]:
str_to_tokens("hello how are you")

array([[268,  30,  11,   3,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0]], dtype=int32)

In [40]:
states_values = enc_model.predict(str_to_tokens(input('Enter a question: ')))

empty_target_seq = np.zeros((1,1))
empty_target_seq[0,0] = tokenizer.word_index['start']

# Set while loop breaking condition
stop_condition = False
decoded_translation = ''

# Loop and generate words
while not stop_condition:
  dec_outputs, h, c = dec_model.predict([empty_target_seq] + states_values)
  sampled_word_index = np.argmax(dec_outputs[0, -1, :])
  sampled_word = None

  for word, index in tokenizer.word_index.items():
    if sampled_word_index == index:
      if word != 'end':
        decoded_translation += ' {}'.format(word)
      sampled_word = word

  if sampled_word == 'end' or len(decoded_translation.split()) > maxlen_answers:
    stop_condition = True

  empty_target_seq = np.zeros((1, 1))
  empty_target_seq[0, 0] = sampled_word_index
  states_values = [h, c]

print(decoded_translation)

Enter a questions: how are you
 i am quite immature
