In [1]:
import string 
import re
import itertools
import numpy as np
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from random import randint
nltk.download('punkt')

from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import Model 

from tensorflow.keras.layers import (LSTM,
                                     GRU, 
                                     Embedding, 
                                     Dense, 
                                     Input,
                                     AdditiveAttention,
                                     Layer)
from tensorflow import ones, shape
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import load_model


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\pshen\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
def clean_text(text):
    '''Clean text by removing unnecessary characters and altering the format of words.'''
    text = text.lower()
    text = re.sub(r"i'm", "i am", text)
    text = re.sub(r"he's", "he is", text)
    text = re.sub(r"she's", "she is", text)
    text = re.sub(r"it's", "it is", text)
    text = re.sub(r"that's", "that is", text)
    text = re.sub(r"what's", "that is", text)
    text = re.sub(r"where's", "where is", text)
    text = re.sub(r"how's", "how is", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"can't", "cannot", text)
    text = re.sub(r"n't", " not", text)
    text = re.sub(r"n'", "ng", text)
    text = re.sub(r"'bout", "about", text)
    text = re.sub(r"'til", "until", text)
    text = re.sub(r"[-()\"#/@;:<>{}`+=~|]", "", text)
    text = " ".join(text.split())
    return text

In [3]:
with open('./ChatBot_Dataset/movie_lines.txt', 'r', encoding='cp1252') as movie_lines:
    lines = movie_lines.read().split('\n')
    
movie_line_dict = {}
for line in lines:
    text = line.split(' +++$+++ ')[-1]
    index = line.split(' +++$+++ ')[0]
    clean_line = clean_text(text)
    tokinized_text = nltk.word_tokenize(clean_line)
    movie_line_dict[index] = tokinized_text
        

with open('./ChatBot_Dataset/movie_conversations.txt', 'r', encoding='cp1252') as movie_conversation:
    lines = movie_conversation.read().split('\n')
movie_conversation_list = [line.split(' +++$+++ ')[-1][1:-1].replace("'", '').split(', ') for line in lines]

In [4]:
encoder_input, decoder_input, decoder_target  = [], [], []

for conv in movie_conversation_list:
    for line in range(len(conv)-1): 
        encoder_input.append(movie_line_dict[conv[line]])
        decoder_input.append(movie_line_dict[conv[line+1]])
        decoder_target.append(movie_line_dict[conv[line+1]])

In [6]:
len_seq_enc = [len(seq) for seq in encoder_input]
len_seq_dec = [len(seq) for seq in decoder_input]

for percentile in [30, 40, 80, 90]:
    print('{} percentile of encoder length sequances = {}'.
          format(percentile, np.percentile(len_seq_enc, percentile)))
    print('{} percentile of encoder length dequances = {}'.
          format(percentile, np.percentile(len_seq_dec, percentile)), end = '\n\n')
    
enc_inp_trimmed, dec_inp_trimmed, dec_tar_trimmed = [], [], []

MIN_SEQ_LEN = 2
MAX_SEQ_LEN = 20

for i in range(len(encoder_input)):
    if (len(encoder_input[i]) >= MIN_SEQ_LEN and len(encoder_input[i]) <= MAX_SEQ_LEN)\
    and (len(decoder_input[i]) >= MIN_SEQ_LEN and len(decoder_input[i]) <= MAX_SEQ_LEN):
        enc_inp_trimmed.append(encoder_input[i])
        dec_inp_trimmed.append(decoder_input[i])
        dec_tar_trimmed.append(decoder_target[i])


print(f'sample shape encoder input = {len(enc_inp_trimmed)}')
print(f'sample shape decoder input = {len(dec_inp_trimmed)}')
print(f'sample shape decoder target = {len(dec_tar_trimmed)}')

30 percentile of encoder length sequances = 6.0
30 percentile of encoder length dequances = 6.0

40 percentile of encoder length sequances = 7.0
40 percentile of encoder length dequances = 7.0

80 percentile of encoder length sequances = 19.0
80 percentile of encoder length dequances = 19.0

90 percentile of encoder length sequances = 27.0
90 percentile of encoder length dequances = 29.0

sample shape encoder input = 148465
sample shape decoder input = 148465
sample shape decoder target = 148465


In [7]:
num_samples = 30000

result_enc_inp = enc_inp_trimmed[:num_samples]
result_dec_inp = dec_inp_trimmed[:num_samples]
result_dec_tar = dec_tar_trimmed[:num_samples]


In [8]:
words = list(itertools.chain(*list(result_enc_inp), *list(result_dec_inp)))
value, count = np.unique(words, return_counts = True)
word_df = pd.DataFrame({'value': value, 'count': count})
word_df.sort_values('count', ascending = False, inplace = True)


for percentile in [80, 85, 90, 95, 98, 99]:
    print('percent {} = {}'.format(percentile, np.percentile(word_df['count'], percentile)))
    
threshold = 18

word_for_vocab = word_df[word_df['count'] > threshold]['value'].values
vocab = {item: num+4 for num, item in enumerate(word_for_vocab)}
vocab['<START>'] = 0
vocab['<PAD>'] = 1
vocab['<UNK>'] = 2
vocab['<FINISH>'] = 3

VOCAB_SIZE = len(vocab)

print('\n=====================================================\nVocab element exaple: ')

inv_vocab = {num: item for item, num in vocab.items()}
for i in range(6):
    print(inv_vocab[i])
    
print(f'\n=====================================================\nTotal vocab size = {VOCAB_SIZE}')

percent 80 = 7.0
percent 85 = 10.0
percent 90 = 17.0
percent 95 = 42.0
percent 98 = 144.42000000000007
percent 99 = 353.8399999999965

Vocab element exaple: 
<START>
<PAD>
<UNK>
<FINISH>
.
?

Total vocab size = 1410


In [9]:
def char2idx(conv: list, inp = False, tar = False):
    
    if inp:
        return np.array([0] + [2 if elem not in vocab.keys() else vocab[elem] for elem in conv])
    
    if tar:
        return np.array([2 if elem not in vocab.keys() else vocab[elem] for elem in conv] + [3])
    
    return np.array([2 if elem not in vocab.keys() else vocab[elem] for elem in conv])

def idx2char(conv):
    return ['<UNK>' if elem not in inv_vocab.keys() else inv_vocab[elem] for elem in conv]


In [10]:
result_enc_inp_idx = [char2idx(conv) for conv in result_enc_inp]
result_dec_inp_idx = [char2idx(conv, inp=True) for conv in result_dec_inp]
result_dec_tar_idx = [char2idx(conv, tar=True) for conv in result_dec_tar]    

In [11]:
print('encoder_input_seqs:\n', idx2char(result_enc_inp_idx[0]), end = '\n==============================\n')
print('decoder_input_seqs:\n', idx2char(result_dec_inp_idx[0]), end = '\n==============================\n')
print('decoder_target_seqs:\n', idx2char(result_dec_tar_idx[0]), end = '\n==============================\n')

encoder_input_seqs:
 ['well', ',', 'i', 'thought', 'we', 'would', 'start', 'with', '<UNK>', ',', 'if', 'that', 'is', 'okay', 'with', 'you', '.']
decoder_input_seqs:
 ['<START>', 'not', 'the', '<UNK>', 'and', '<UNK>', 'and', '<UNK>', 'part', '.', 'please', '.']
decoder_target_seqs:
 ['not', 'the', '<UNK>', 'and', '<UNK>', 'and', '<UNK>', 'part', '.', 'please', '.', '<FINISH>']


In [12]:
encoder_input_seqs = pad_sequences(
    result_enc_inp_idx,
    value=vocab['<PAD>'],
    padding='post',
    truncating='post',
    maxlen=MAX_SEQ_LEN)

decoder_input_seqs = pad_sequences(
    result_dec_inp_idx,
    value=vocab['<PAD>'],
    padding='post',
    truncating='post',
    maxlen=MAX_SEQ_LEN)

decoder_target_seqs = pad_sequences(
    result_dec_tar_idx,
    value=vocab['<PAD>'],
    padding='post',
    truncating='post',
    maxlen=MAX_SEQ_LEN)


print('encoder_input_seqs shape: ', encoder_input_seqs.shape)
print('decoder_input_seqs shape: ', decoder_input_seqs.shape)
print('decoder_target_seqs shape: ', decoder_target_seqs.shape)

encoder_input_seqs shape:  (30000, 20)
decoder_input_seqs shape:  (30000, 20)
decoder_target_seqs shape:  (30000, 20)


In [14]:
print('encoder_input_seqs:\n', idx2char(encoder_input_seqs[10]), end = '\n==============================\n')
print('decoder_input_seqs:\n', idx2char(decoder_input_seqs[10]), end = '\n==============================\n')
print('decoder_target_seqs:\n', idx2char(decoder_target_seqs[10]), end = '\n==============================\n')

encoder_input_seqs:
 ['hi', '.', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
decoder_input_seqs:
 ['<START>', 'looks', 'like', 'things', 'worked', 'out', 'tonight', ',', 'huh', '?', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
decoder_target_seqs:
 ['looks', 'like', 'things', 'worked', 'out', 'tonight', ',', 'huh', '?', '<FINISH>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']


In [15]:
enc_train, enc_val, dec_tar_train, dec_tar_val, dec_inp_train, dec_inp_val = train_test_split(
    *(encoder_input_seqs, decoder_target_seqs, decoder_input_seqs),
    train_size = 0.9)

print('enx_train shape: ', enc_train.shape)
print('enx_val shape: ', enc_val.shape)
print('dec_tar_train shape: ', dec_tar_train.shape)
print('dec_tar_val shape: ', dec_tar_val.shape)
print('dec_inp_train shape: ', dec_inp_train.shape)
print('dec_inp_train shape: ', dec_inp_train.shape)


enx_train shape:  (27000, 20)
enx_val shape:  (3000, 20)
dec_tar_train shape:  (27000, 20)
dec_tar_val shape:  (3000, 20)
dec_inp_train shape:  (27000, 20)
dec_inp_train shape:  (27000, 20)


In [24]:
EMB_SIZE = 256
H_SIZE = 512


class Encoder(Model):
    def __init__(self):
        super().__init__()
        self.emb = Embedding(VOCAB_SIZE, EMB_SIZE)
        self.lstm = LSTM(H_SIZE, return_sequences = False, return_state = True)

    def call(self, inp):
        out = self.emb(inp)
        _, h, c = self.lstm(out)
        return (h, c)
    
class Decoder(Model):
    def __init__(self):
        super().__init__()
        self.emb = Embedding(VOCAB_SIZE, EMB_SIZE)
        self.lstm = LSTM(H_SIZE, return_sequences = True, return_state = True)
        self.fc = Dense(VOCAB_SIZE, activation = 'softmax')
        
    def call(self, inp, init_state):
        out = self.emb(inp)
        out, h, c = self.lstm(out, initial_state = init_state)
        out = self.fc(out)
        return out, (h, c)
    
def build_model():
    
    encoder = Encoder()
    decoder = Decoder()


    encoder_inputs = Input(shape = (None,) )
    decoder_inputs = Input(shape = (None,) )

    enc_state = encoder(encoder_inputs)
    decoder_outputs, _ = decoder(decoder_inputs, enc_state)
    model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
    model.compile(loss = loss, optimizer = optimizer, metrics = ['accuracy'])
    

    return (model,encoder,decoder)

model, _, _ = build_model()

In [18]:
BATCH_SIZE = 256
EPOCHs = 150

checkpoint_filepath = './model/auto_checkpoint/'
model_checkpoint_callback = ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=True,
    save_freq = 10,)

loss = SparseCategoricalCrossentropy()
optimizer = Adam(learning_rate = .003)

model.fit([enc_train, dec_inp_train], 
          dec_tar_train,
          epochs = EPOCHs, 
          validation_data=([enc_val, dec_inp_val], dec_tar_val),
          batch_size = BATCH_SIZE)
model.save_weights('./model/weights.h5', save_format = 'h5')

In [25]:
model, encoder, decoder = build_model()
model.load_weights('./model/weights.h5')
model.evaluate([enc_val, dec_inp_val], dec_tar_val)



[0.5457586050033569, 0.9363166689872742]

In [26]:
def seq2seq_inference(input_seq):
    state = encoder(input_seq)

    target_seq = np.array([[vocab['<START>']]])

    decoded_sentence = ''
    decode_count = 0
    while True:
        output_tokens, state = decoder(target_seq, state)

        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = inv_vocab[sampled_token_index]
        decoded_sentence += sampled_char + ' '
        decode_count += 1


        if (sampled_char == '<FINISH>' or
           decode_count > MAX_SEQ_LEN):
              break

        target_seq = np.array([[sampled_token_index]])

    return ''.join(decoded_sentence)

In [27]:
i = randint(0, 30000)
for seq_index in range(i, i+15):
    input_seq = encoder_input_seqs[seq_index: seq_index + 1]
    decoded_sentence = seq2seq_inference(input_seq)
    print('-')
    print('Input sentence:', ' '.join(result_enc_inp[seq_index]))
    print('Result sentence:', decoded_sentence)
    print('Target sentence:', ' '.join(result_dec_tar[seq_index]))


-
Input sentence: your men knew the risks .
Result sentence: what is going on here , man ? <FINISH> 
Target sentence: what is going on here , man ?
-
Input sentence: with my money !
Result sentence: we should not have done that . <FINISH> 
Target sentence: we should go burn that school to the ground , sir !
-
Input sentence: the final route for the railroad is complete .
Result sentence: call me <UNK> . <FINISH> 
Target sentence: i look forward to seeing it .
-
Input sentence: parker .
Result sentence: <UNK> . <FINISH> 
Target sentence: sir ?
-
Input sentence: sir ?
Result sentence: <UNK> ! look ! the <UNK> 's <UNK> <UNK> . he is up back to the <UNK> . <FINISH> 
Target sentence: what is that ?
-
Input sentence: what is that ?
Result sentence: no there are <UNK> to see what we are going to do better . <FINISH> 
Target sentence: what , sir ?
-
Input sentence: that .
Result sentence: you are clean . <FINISH> 
Target sentence: oh , that . i will let jenkins explain .
-
Input sentence: this