tutorial link: https://medium.com/swlh/how-to-design-seq2seq-chatbot-using-keras-framework-ae86d950e91d
github repo: https://github.com/dredwardhyde/Seq2Seq-Chatbot-English

In [None]:
import codecs
import io
import os
import re
import zipfile
import tensorflow as tf

In [2]:
import numpy as np
import requests
from tensorflow.keras import Input, Model
from tensorflow.keras.activations import softmax
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from keras_preprocessing.text import Tokenizer

In [3]:
url = 'http://www.cs.cornell.edu/~cristian/data/cornell_movie_dialogs_corpus.zip'
r = requests.get(url)
z = zipfile.ZipFile(io.BytesIO(r.content))
z.extractall()

In [4]:
def get_all_conversations():
    all_conversations = []
    with codecs.open("./cornell movie-dialogs corpus/movie_lines.txt",
                     "rb",
                     encoding="utf-8",
                     errors="ignore") as f:
        lines = f.read().split("\n")
        for line in lines:
            all_conversations.append(line.split(" +++$+++ "))
    return all_conversations

In [5]:
def get_all_sorted_chats(all_conversations):
    all_chats = {}
    # get only first 10000 conversations from dataset because whole dataset will take 9.16 TiB of RAM
    for tokens in all_conversations[:10000]:
        if len(tokens) > 4:
            all_chats[int(tokens[0][1:])] = tokens[4]
    return sorted(all_chats.items(), key=lambda x: x[0])

In [6]:
def get_conversation_dict(sorted_chats):
    conv_dict = {}
    counter = 1
    conv_ids = []
    for i in range(1, len(sorted_chats) + 1):
        if i < len(sorted_chats):
        # if the current line number differs 
        # from the previous only by 1
            if (sorted_chats[i][0] - sorted_chats[i - 1][0]) == 1:
             # then this line is a part of the current conversation
             # if the previous line was not added before,
             # then we should add it now
                if sorted_chats[i - 1][1] not in conv_ids:
                    conv_ids.append(sorted_chats[i - 1][1])
                # or just append the current line
                conv_ids.append(sorted_chats[i][1])
            # and if the difference is more than 1 - it means new
            # conversation has started and we should clear conv_ids
            elif (sorted_chats[i][0] - sorted_chats[i - 1][0]) > 1:
                conv_dict[counter] = conv_ids
                conv_ids = []
            counter += 1
        else:
            continue
    return conv_dict

In [7]:
def get_clean_q_and_a(conversations_dictionary):
    ctx_and_target = []
    for current_conv in conversations_dictionary.values():
        # make sure that each conversation 
        # contains an even number of lines
        if len(current_conv) % 2 != 0:
            current_conv = current_conv[:-1]
        # convert questions and answers to the list of tuples
        for i in range(0, len(current_conv), 2):
            ctx_and_target.append((current_conv[i], 
                                   current_conv[i + 1]))
    # zip with * operator unzips tuples into independent lists
    context, target = zip(*ctx_and_target)
    context_dirty = list(context)
    # clear questions from contracted forms, non-letter symbols
    # and convert it to lowercase
    clean_questions = list()
    for i in range(len(context_dirty)):
        clean_questions.append(clean_text(context_dirty[i]))
    target_dirty = list(target)
    # do the same with the answers, but now we need
    # to add 'start' and 'end' words
    clean_answers = list()
    for i in range(len(target_dirty)):
        clean_answers.append('<START> ' 
                     + clean_text(target_dirty[i]) 
                     + ' <END>')
    return clean_questions, clean_answers

In [8]:
def clean_text(text_to_clean):
    res = text_to_clean.lower()
    res = re.sub(r"i'm", "i am", res)
    res = re.sub(r"he's", "he is", res)
    res = re.sub(r"she's", "she is", res)
    res = re.sub(r"it's", "it is", res)
    res = re.sub(r"that's", "that is", res)
    res = re.sub(r"what's", "what is", res)
    res = re.sub(r"where's", "where is", res)
    res = re.sub(r"how's", "how is", res)
    res = re.sub(r"\'ll", " will", res)
    res = re.sub(r"\'ve", " have", res)
    res = re.sub(r"\'re", " are", res)
    res = re.sub(r"\'d", " would", res)
    res = re.sub(r"\'re", " are", res)
    res = re.sub(r"won't", "will not", res)
    res = re.sub(r"can't", "cannot", res)
    res = re.sub(r"n't", " not", res)
    res = re.sub(r"n'", "ng", res)
    res = re.sub(r"'bout", "about", res)
    res = re.sub(r"'til", "until", res)
    res = re.sub(r"[-()\"#/@;:<>{}`+=~|.!?,]", "", res)
    return res

In [9]:
conversations = get_all_conversations()
total = len(conversations)

In [10]:
print("Total conversations in dataset: {}".format(total))
all_sorted_chats = get_all_sorted_chats(conversations)
conversation_dictionary = get_conversation_dict(all_sorted_chats)
questions, answers = get_clean_q_and_a(conversation_dictionary)
print("Questions in dataset: {}".format(len(questions)))
print("Answers in dataset: {}".format(len(answers)))

Total conversations in dataset: 304714
Questions in dataset: 4709
Answers in dataset: 4709


In [11]:
for i in range(20):
    print(questions[i])
    print(answers[i]+'\n')

did you change your hair
<START> no <END>

i missed you
<START> it says here you exposed yourself to a group of freshmen girls <END>

it was a bratwurst  i was eating lunch
<START> with the teeth of your zipper <END>

you the new guy
<START> so they tell me <END>

c'mon  i am supposed to give you the tour
<START> so  which dakota you from <END>

north actually  how would you   
<START> i was kidding people actually live there <END>

yeah  a couple  we are outnumbered by the cows though
<START> how many people were in your old school <END>

thirtytwo
<START> get out <END>

how many people go here
<START> couple thousand most of them evil <END>

that i am used to
<START> yeah but these guys have never seen a horse  they just jack off to clint eastwood <END>

that girl  i 
<START> you burn you pine you perish <END>

who is she
<START> bianca stratford  sophomore do not even think about it <END>

why not
<START> i could start with your haircut but it does not matter  she is not allowed to 

In [12]:
target_regex = '!"#$%&()*+,-./:;<=>?@[\]^_`{|}~\t\n\'0123456789'
# Tokenizer allows to vectorize our corpus by turning each sentence
# into a sequence of integers where each integer is an index
# of a token in an internal dictionary
tokenizer = Tokenizer(filters=target_regex)
tokenizer.fit_on_texts(questions + answers)
VOCAB_SIZE = len(tokenizer.word_index) + 1
# size of our vocabulary is 7910 words
print('Vocabulary size : {}'.format(VOCAB_SIZE))

Vocabulary size : 7910


In [13]:
tokenized_questions = tokenizer.texts_to_sequences(questions)
# maximum question length is 223 words
maxlen_questions = max([len(x) for x in tokenized_questions])
# pad each question with zeros at the end to be 223 words long
encoder_input_data = pad_sequences(tokenized_questions, 
                                 maxlen=maxlen_questions,
                                 padding='post')
# matrix of 4709x223 integers - 4709 questions 223 words each
print(encoder_input_data.shape)
tokenized_answers = tokenizer.texts_to_sequences(answers)
# maximum answer length is 132 words
maxlen_answers = max([len(x) for x in tokenized_answers])
# pad each answer with zeros at the end to be 132 words long
decoder_input_data = pad_sequences(tokenized_answers,   
                                   maxlen=maxlen_answers,
                                   padding='post')
# matrix of 4709x132 integers - 4709 answers 132 words each
print(decoder_input_data.shape)

(4709, 223)
(4709, 132)


In [22]:
# remove the first 'start' word from every answer
for i in range(len(tokenized_answers)):
    tokenized_answers[i] = tokenized_answers[i][1:]
# pad answers with zeros
padded_answers = pad_sequences(tokenized_answers, maxlen=maxlen_answers, padding='post')
# tensor of size (4709, 132, 7910)
# 4709 answers 132 words each, and each word 
# is one-hot encoded using our vocabulary
decoder_output_data= to_categorical(padded_answers, VOCAB_SIZE, dtype=np.int8)

In [24]:
print(decoder_output_data.shape)

(4709, 132, 7910)


In [34]:
# encoder will be used to capture space-dependent 
# relations between words from the questions
enc_inputs = Input(shape=(None,))
enc_embedding = Embedding(VOCAB_SIZE, 200, mask_zero=True)(enc_inputs)
enc_lstm = LSTM(200, return_state=True)
enc_lstm. _could_use_gpu_kernel = False
enc_outputs, state_h, state_c = enc_lstm(enc_embedding)
enc_states = [state_h, state_c]

In [35]:
# decoder will be used to capture space-dependent relations 
# between words from the answers using encoder's 
# internal state as a context
dec_inputs = Input(shape=(None,))
dec_embedding = Embedding(VOCAB_SIZE, 200, mask_zero=True)(dec_inputs)
dec_lstm = LSTM(200, return_state=True, return_sequences=True)
dec_lstm. _could_use_gpu_kernel = False
dec_outputs, _, _ = dec_lstm(dec_embedding,  
                             initial_state=enc_states)

In [36]:
# decoder is connected to the output Dense layer
dec_dense = Dense(VOCAB_SIZE, activation=softmax)
output = dec_dense(dec_outputs)

In [37]:
model = Model([enc_inputs, dec_inputs], output)
model.compile(optimizer=RMSprop(), loss='categorical_crossentropy')

model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
input_6 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding_4 (Embedding)         (None, None, 200)    1582000     input_5[0][0]                    
__________________________________________________________________________________________________
embedding_5 (Embedding)         (None, None, 200)    1582000     input_6[0][0]                    
____________________________________________________________________________________________

In [39]:
model.fit([encoder_input_data, decoder_input_data],
          decoder_output_data,
          batch_size=50,
          epochs=300)
model.save('model_big.h5')

Train on 4709 samples
Epoch 1/300
 200/4709 [>.............................] - ETA: 4:22 - loss: 0.8220

KeyboardInterrupt: 