##  General purpose Chatbot

#### 1. Chatbot implemented using Seq2Seq ( Vanilla Encoder-Decoder architecture)
#### 2. Built using LSTMs
#### 3. Implemented in TensorFlow 2.0+

In [1]:
# Building a deep NLP chatbot using seq2seq (Encoder Decoder architecture using LSTMs)

# importing the required libraries
import os
import re
import time

import numpy as np
import tensorflow as tf

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Embedding, LSTM, Input
from tensorflow.keras.activations import softmax
from tensorflow.keras.preprocessing.sequence import pad_sequences

####  Dataset

The dataset used to build this general-purpose encoder-decoder chatbot is the "Cornell Movie-Dialogs Corpus", which provides sets of movie dialogues between pairs of characters drawn from many different movies.

In [2]:
# Dataset folder path.
# The loading cell joins this folder with 'movie_lines.txt' and
# 'movie_conversations.txt' via os.path.join, so both files must live here.
DATASET_PATH = 'dataset/cornell movie-dialogs corpus/'

In [3]:
# Check whether TensorFlow can see any GPU and, if it can, let it allocate GPU
# memory on demand instead of reserving all of it up front.
physical_devices = tf.config.experimental.list_physical_devices('GPU')
print("Number of available GPUs : {}".format(len(physical_devices)))

# Iterate instead of indexing [0]: on a CPU-only machine the list is empty and
# physical_devices[0] would raise IndexError. Applying the setting to every
# GPU also keeps the memory-growth option consistent across devices.
for device in physical_devices:
    tf.config.experimental.set_memory_growth(device, True)

Number of available GPUs : 1


####  Data Preparation

In [4]:
# Read the raw movie lines, one record per text line.
# The `with` block closes the file even if reading raises.
with open(os.path.join(DATASET_PATH, 'movie_lines.txt'), mode='r', encoding='latin-1') as lines_buff:
    lines = lines_buff.read().split('\n')

# Read the conversation records (each one lists the line ids that form a dialogue).
with open(os.path.join(DATASET_PATH, 'movie_conversations.txt'), mode='r', encoding='latin-1') as converasations_buff:
    converasations = converasations_buff.read().split('\n')

In [5]:
# Build a lookup from dialogue id (first field) to dialogue text (last field).
# Records that do not split into exactly five fields are ignored.
lines = [line.split(' +++$+++ ') for line in lines]
mapping_id2line = {fields[0]: fields[-1] for fields in lines if len(fields) == 5}

In [6]:
# Verify the mapping of dialogues
# mapping_id2line

In [7]:
# List of all conversations: each entry is the list of line ids of one dialogue.
conversations_list = []
for converasation in converasations:
    # The last field holds the id list; drop its first and last character
    # (the enclosing brackets), then remove quotes and spaces before splitting.
    id_field = converasation.split(' +++$+++ ')[-1][1:-1]
    id_field = id_field.strip().replace("'", "").replace(" ", "")
    conversations_list.append(id_field.split(','))

In [8]:
# Verify the list of all conversations
# conversations_list

In [9]:
# Pair every line of a conversation with the line that follows it:
# the earlier line is the question, the later one its answer.
questions = []
answers = []

# The final entry of conversations_list is skipped, exactly as before.
for conversation in conversations_list[:-1]:
    for question_id, answer_id in zip(conversation, conversation[1:]):
        questions.append(mapping_id2line[question_id])
        answers.append(mapping_id2line[answer_id])

In [10]:
# Keep only the first 100,000 questions (the slice bound is 100000, not 1 million)
questions = questions[:100000]

In [11]:
# Verify the questions list
# questions

In [12]:
# Keep only the first 100,000 answers (the slice bound is 100000, not 1 million)
answers = answers[:100000]

In [13]:
# Verifying answers
# answers

###  Data Cleaning

In [14]:
def clean_text(text):
    """Return a normalised copy of `text`.

    The text is lower-cased, a fixed list of contractions is expanded in a
    fixed order, and a fixed set of punctuation characters is deleted
    (removed outright, not replaced by a space).
    """
    # (pattern, replacement) pairs; they are applied strictly in this order
    expansions = (
        (r"i'm", "i am"),
        (r"don't", "do not"),
        (r"he's", "he is"),
        (r"she's", "she is"),
        (r"that's", "that is"),
        (r"what's", "what is"),
        (r"where's", "where is"),
        (r"\'ll", " will"),
        (r"\'ve", " have"),
        (r"\'re", " are"),
        (r"\'d", " would"),
        (r"won't", "will not"),
        (r"can't", "cannot"),
    )

    cleaned = text.lower()
    for pattern, replacement in expansions:
        cleaned = re.sub(pattern, replacement, cleaned)

    # delete the non-essential characters listed in the character class
    return re.sub(r"[-{}\"#/@;:<>()+=`|.?,]", "", cleaned)

In [15]:
# Run every question through clean_text, keeping the original order
questions_cleaned = list(map(clean_text, questions))

In [16]:
# Verify cleaned questions
# questions_cleaned

In [17]:
# Run every answer through clean_text, keeping the original order
answers_cleaned = list(map(clean_text, answers))

In [18]:
# Verify cleaned answer
# answers_cleaned

In [19]:
# Cap every question at its first 26 whitespace-separated words
truncated_questions = [
    ' '.join(question.split()[:26]).strip()
    for question in questions_cleaned
]

In [20]:
# Cap every answer at its first 26 whitespace-separated words
truncated_answers = [
    ' '.join(answer.split()[:26]).strip()
    for answer in answers_cleaned
]

In [21]:
# From here on the "cleaned" names refer to the truncated sentences
questions_cleaned, answers_cleaned = truncated_questions, truncated_answers

####  Remove rare words

In [22]:
# Word -> occurrence count, shared by the question and answer vocabularies;
# it is used afterwards to find rarely occurring words
count_mapping_dict = dict()
def data_imporvement(text):
    """Add the whitespace-separated words of `text` to count_mapping_dict."""
    for token in text.split():
        count_mapping_dict[token] = count_mapping_dict.get(token, 0) + 1

In [23]:
# Accumulate word counts over all cleaned questions
for question_text in questions_cleaned:
    data_imporvement(question_text)

In [24]:
# Accumulate word counts over all cleaned answers (into the same dictionary)
for answer_text in answers_cleaned:
    data_imporvement(answer_text)

In [25]:
# Verify the word count dictionary
# count_mapping_dict

In [26]:
# Question vocabulary: keep only words whose count is strictly greater than
# the threshold and number them consecutively in dictionary order.
threshold = 40
frequent_question_words = [
    word for word, count in count_mapping_dict.items() if count > threshold
]
questionwords2int = {
    word: index for index, word in enumerate(frequent_question_words)
}
# word_number ends up as the number of words that were kept
word_number = len(questionwords2int)

In [27]:
# Verify question words vocabulary to int mapping dictionary
# questionwords2int

In [28]:
# Verify the length of mapping dict or the number of words in the question vocabulary
# (at this point the special tokens have not been added yet)
len(questionwords2int)

2613

In [29]:
# Answer vocabulary: keep only words whose count is strictly greater than
# the threshold and number them consecutively in dictionary order.
threshold = 40
frequent_answer_words = [
    word for word, count in count_mapping_dict.items() if count > threshold
]
answerwords2int = {
    word: index for index, word in enumerate(frequent_answer_words)
}
# word_number ends up as the number of words that were kept
word_number = len(answerwords2int)

In [30]:
# Verify answer words vocabulary to int mapping dictionary
# answerwords2int

In [31]:
# Verify the length of mapping dict or the number of words in the answer vocabulary
# (at this point the special tokens have not been added yet)
len(answerwords2int)

2613

####  Add tokens for data preparing

In [32]:
# Tokens are used while encoding and decoding

In [33]:
# <SOS> Start of string
# <EOS> End of String
# <PAD> for maintaining the length of input
# <OUT> for words not used while training(filter out)
tokens = ['<PAD>', '<EOS>', '<OUT>', '<SOS>']

# Append each special token to the question vocabulary with the next free id.
# The membership check keeps this cell idempotent: without it, re-running the
# cell would reassign every token to the same id, because len() no longer
# grows once the tokens are already present.
for token in tokens:
    if token not in questionwords2int:
        questionwords2int[token] = len(questionwords2int)

In [34]:
# Append each special token to the answer vocabulary with the next free id.
# The membership check keeps this cell idempotent: without it, re-running the
# cell would reassign every token to the same id, because len() no longer
# grows once the tokens are already present.
for token in tokens:
    if token not in answerwords2int:
        answerwords2int[token] = len(answerwords2int)

#### Creating an inverse index to words dictionary

In [35]:
# Inverse of answerwords2int (index -> word), so that a predicted word index
# can be turned back into a word by the decoder
int2answerwords = {index: word for word, index in answerwords2int.items()}

#### Adding  tags for decoder training

In [36]:
# Adding <SOS> at the start of answers sentence and <EOS> at the end of every answer sentence for decoder to learn where to stop

In [37]:
# Wrap every answer in '<SOS> ... <EOS>' so the decoder learns where a
# sentence starts and stops.
# The list is edited in place, so re-running the cell used to tag every
# answer twice; the startswith check makes the cell idempotent. An untagged
# answer cannot start with '<SOS> ' because clean_text removes '<' and '>'.
for i, answer in enumerate(answers_cleaned):
    if not answer.startswith('<SOS> '):
        answers_cleaned[i] = '<SOS> ' + answer + ' <EOS>'

In [38]:
# Verify the updated data
# answers_cleaned

#### Translate all the question and answers into integers

In [39]:
# Encode each question as a list of word ids; words missing from the
# question vocabulary are mapped to the id of '<OUT>'
question_to_int = [
    [
        questionwords2int[word] if word in questionwords2int else questionwords2int['<OUT>']
        for word in question.split()
    ]
    for question in questions_cleaned
]

In [40]:
# Encode each answer as a list of word ids; words missing from the
# answer vocabulary are mapped to the id of '<OUT>'
answer_to_int = [
    [
        answerwords2int[word] if word in answerwords2int else answerwords2int['<OUT>']
        for word in answer.split()
    ]
    for answer in answers_cleaned
]

In [41]:
# Length of the longest encoded question; used as the encoder sequence length
question_sequence_length = max(map(len, question_to_int))
question_sequence_length

26

In [42]:
# Length of the longest encoded answer; used as the decoder sequence length
answer_sequence_length = max(map(len, answer_to_int))
answer_sequence_length

28

In [43]:
# Create the LSTM based Encoder-Decoder model using the Keras Functional API

# dimension of the embedding layers
EMBED_HID_DIM = 100

# dimension of the LSTM units
latent_dim = 100

# Size of the question and answer vocabularies (special tokens included)
vocab_questions = len(questionwords2int)
vocab_answers = len(answerwords2int)

# ---- Encoder ----
# Variable-length sequence of word ids.
encoder_inputs = Input(shape=(None,))

# Embedding layer for the question words.
# `input_length` is intentionally not passed: the input is declared with a
# variable length, and the argument is deprecated in recent Keras releases.
inputs_embed = Embedding(input_dim=vocab_questions, output_dim=EMBED_HID_DIM)
encoder_embed = inputs_embed(encoder_inputs)

# Encoder LSTM; return_state=True also returns the final hidden and cell state
encoder = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_embed)

# We discard `encoder_outputs` and only keep the states.
# (The intermediate encoder-only Model that used to be built here was
# overwritten by the training model below before ever being used, so it has
# been removed.)
encoder_states = [state_h, state_c]

# ---- Decoder ----
# Set up the decoder, using `encoder_states` as initial state.

# Variable-length sequence of word ids.
decoder_inputs = Input(shape=(None,))

# Embedding layer for the answer words
decode_inputs_embed = Embedding(input_dim=vocab_answers, output_dim=EMBED_HID_DIM)
decoder_embed = decode_inputs_embed(decoder_inputs)

# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embed,
                                     initial_state=encoder_states)

# Final classifier: softmax over the answer vocabulary at every time step
decoder_dense = Dense(vocab_answers, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.summary()

Model: "functional_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, None, 100)    261700      input_1[0][0]                    
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 100)    261700      input_2[0][0]                    
_______________________________________________________________________________________

In [44]:
# Compile the Keras model.
# The targets produced by generate_batch are one-hot vectors, hence
# categorical cross-entropy. `metrics` is passed as a list, which is the
# documented form; a bare string is not accepted by every Keras version.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])

#### Data preparation for model training 

In [45]:
# Bring all integer-coded questions and answers to a fixed length using <PAD>
def padding(encoder_sequences, decoder_sequences):
    """Pad or cut both sequence collections to their fixed lengths.

    Questions are brought to `question_sequence_length` and answers to
    `answer_sequence_length`. Padding and truncation both happen at the end
    of a sequence, and the fill value is the '<PAD>' id of the respective
    vocabulary. Returns the pair (padded questions, padded answers).
    """
    shared_options = dict(dtype='int32', padding='post', truncating='post')

    padded_questions = pad_sequences(
        encoder_sequences,
        maxlen=question_sequence_length,
        value=questionwords2int['<PAD>'],
        **shared_options)
    padded_answers = pad_sequences(
        decoder_sequences,
        maxlen=answer_sequence_length,
        value=answerwords2int['<PAD>'],
        **shared_options)

    return padded_questions, padded_answers

encoder_input_data, decoder_input_data = padding(question_to_int, answer_to_int)

In [46]:
# Verify Encoder input after padding
# encoder_input_data

In [47]:
# Verify encoder input shape: (number of samples, question_sequence_length)
encoder_input_data.shape

(100000, 26)

In [48]:
# Verify Decoder input after padding
# decoder_input_data

In [49]:
# Verify decoder input shape: (number of samples, answer_sequence_length)
decoder_input_data.shape

(100000, 28)

In [50]:
# Function to generate batches of training data on the fly, because the
# one-hot encoded targets for the whole dataset are too big to fit in memory
# encoder_input_data  = the question sequence as it is
# decoder_input_data  = the answer sequence without its last element
# decoder_target_data = the answer sequence offset by one timestep, one-hot encoded

max_source_length = question_sequence_length
max_target_length = answer_sequence_length
num_decoder_tokens = vocab_answers

def generate_batch(X , y , batch_size = 128):
    """Endlessly yield `([encoder_input, decoder_input], decoder_target)` batches.

    Parameters
    ----------
    X : sequence of integer-encoded questions (at most max_source_length long).
    y : sequence of integer-encoded answers (at most max_target_length long).
    batch_size : maximum number of samples per yielded batch.

    Yields
    ------
    ([encoder_input_data, decoder_input_data], decoder_target_data) where the
    first axis of every array is the number of samples actually in the batch.
    The generator restarts from the beginning of X once it is exhausted.

    Raises
    ------
    ValueError
        If X is empty (the endless loop could otherwise never yield).
    """
    if len(X) == 0:
        raise ValueError("generate_batch needs at least one sample")

    while True:
        for j in range(0, len(X), batch_size):
            batch_X = X[j:j + batch_size]
            batch_y = y[j:j + batch_size]

            # The last slice can hold fewer than batch_size samples. Size the
            # arrays to the real sample count so that no all-zero rows are fed
            # to the model as if they were training samples.
            n_rows = min(len(batch_X), len(batch_y))

            encoder_input_data = np.zeros((n_rows, max_source_length),dtype='int32')
            decoder_input_data = np.zeros((n_rows, max_target_length),dtype='int32')
            decoder_target_data = np.zeros((n_rows, max_target_length, num_decoder_tokens),dtype='int32')

            for i, (input_seq, target_seq) in enumerate(zip(batch_X, batch_y)):
                for t, word in enumerate(input_seq):
                    encoder_input_data[i, t] = word
                for t, word in enumerate(target_seq):
                    if t<len(target_seq)-1:
                        # decoder input: every element except the last one
                        decoder_input_data[i, t] = word
                    if t>0:
                        # decoder target (one-hot): skips the first element
                        # (the start token) and is offset by one timestep
                        decoder_target_data[i, t - 1, word] = 1.

            yield([encoder_input_data, decoder_input_data], decoder_target_data)

In [51]:
# Model training configuration
train_samples = len(question_to_int) # Total training samples (one per encoded question)
batch_size = 128  # number of samples per generated batch
epochs = 25  # number of epochs passed to model.fit

In [52]:
# Padded questions are the model inputs; padded answers are the source of
# both the decoder inputs and the decoder targets
X_train, y_train = encoder_input_data, decoder_input_data

In [53]:
# Model fit
# generate_batch yields ceil(train_samples / batch_size) batches per pass over
# the data. Rounding up here makes one epoch exactly one pass; with floor
# division every epoch stopped one batch early, so epoch boundaries drifted
# through the dataset.
steps_per_epoch = (train_samples + batch_size - 1) // batch_size
model.fit(generate_batch(X_train, y_train, batch_size = batch_size),
                    steps_per_epoch = steps_per_epoch, epochs=epochs)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<tensorflow.python.keras.callbacks.History at 0x1a4b5ac42c8>

In [59]:
# Save the model to disk.
# Create the target folder first: saving fails when 'model/' does not exist.
os.makedirs('model', exist_ok=True)
model.save('model/LSTM_chatbot.h5')

####  Inference with Model

In [60]:
# Inference encoder: maps an input sequence to the encoder LSTM's final
# [hidden state, cell state] pair (the "context vectors").
# It is built from the same encoder_inputs / encoder_states tensors as the
# training model.
encoder_model = Model(encoder_inputs, encoder_states)

# Decoder setup

# Placeholders for the decoder LSTM state of the previous time step;
# at inference time they are fed explicitly on every call.
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_state_input = [decoder_state_input_h, decoder_state_input_c]

# Get the embeddings of the decoder sequence (same embedding layer object
# that the training model uses)
dec_emb2 = decode_inputs_embed(decoder_inputs)

# To predict the next word in the sequence, set the initial states to the
# states from the previous time step (same LSTM layer object as in training)
decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=decoder_state_input)
decoder_states2 = [state_h2, state_c2]

# A dense softmax layer to generate prob dist. over the target vocabulary
decoder_outputs2 = decoder_dense(decoder_outputs2)

# Final decoder model:
#   inputs  -> [token sequence, previous hidden state, previous cell state]
#   outputs -> [softmax over the answer vocabulary, new hidden state, new cell state]
decoder_model = Model(
    [decoder_inputs] + decoder_state_input,
    [decoder_outputs2] + decoder_states2)

In [61]:
# Function to decode the sequence from a decoder given the input sequence
def decode_sequence(input_seq, max_length=25):
    """Greedily decode an answer for one encoded question.

    Parameters
    ----------
    input_seq : encoder input holding a single sequence (batch of size 1).
    max_length : decoding stops once more than this many words have been
        produced, even if '<EOS>' has not been sampled. Defaults to 25.

    Returns
    -------
    str
        The sampled words, each preceded by one space. The sampled '<EOS>'
        token, if any, is included as the last word.
    """
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)

    # Target sequence of length 1, holding the start token.
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = answerwords2int['<SOS>']

    # Collect words in a list instead of re-splitting a growing string on
    # every step to count them.
    decoded_words = []

    while True:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # Greedy sampling: take the most probable token of the last time step
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = int2answerwords[sampled_token_index]
        decoded_words.append(sampled_word)

        # Exit condition: either find the stop token or exceed the max length.
        if sampled_word == '<EOS>' or len(decoded_words) > max_length:
            break

        # Feed the sampled token and the new states into the next step
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]

    return ''.join(' ' + word for word in decoded_words)

#### Make inference on training data

In [62]:
# Create a batch generator for batch size 1, so every next() call yields one sample
train_gen = generate_batch(X_train, y_train, batch_size = 1)
# k is the index of the sample most recently drawn from train_gen; it starts
# at -1 because the inference cell increments it before using it
k=-1

In [105]:
# Predict the target sentence and compare with the actual target sentence given a source sentence.
# Every run of this cell advances to the next training sample; k wraps around
# in step with train_gen, which restarts after the last sample.
k = (k + 1) % len(questions_cleaned)
(input_seq, actual_output), target_output = next(train_gen)
decoded_sentence = decode_sequence(input_seq).strip()
# Remove the trailing '<EOS>' only when it is really there: when decoding
# stops on the length limit, blindly cutting 5 characters would remove part
# of the last predicted word.
if decoded_sentence.endswith('<EOS>'):
    decoded_sentence = decoded_sentence[:-len('<EOS>')].strip()
print('Input Source sentence:',questions_cleaned[k:k+1][0])
# [5:-5] cuts the 5-character '<SOS>' and '<EOS>' tags off the stored answer
print('Actual Target Sentence:', answers_cleaned[k:k+1][0][5:-5])
print('Predicted Target Sentence:', decoded_sentence)

Input Source sentence: she okay
Actual Target Sentence:  i hope so 
Predicted Target Sentence: i am not sure
