# Imports

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import re
import time
!pip install --upgrade tensorlayer
from tensorlayer.layers import DenseLayer, EmbeddingInputlayer, Seq2Seq, retrieve_seq_length_op2

from tqdm import tqdm
from sklearn.utils import shuffle
import tensorlayer as tl
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Collecting tensorlayer
[?25l  Downloading https://files.pythonhosted.org/packages/cf/55/2dc51f4a8e772240e63c442de06762ddefd0631399f446b6895be5e2590d/tensorlayer-1.11.1-py2.py3-none-any.whl (316kB)
[K    100% |████████████████████████████████| 317kB 21.5MB/s 
Collecting matplotlib<3.1,>=2.2 (from tensorlayer)
[?25l  Downloading https://files.pythonhosted.org/packages/71/07/16d781df15be30df4acfd536c479268f1208b2dfbc91e9ca5d92c9caf673/matplotlib-3.0.2-cp36-cp36m-manylinux1_x86_64.whl (12.9MB)
[K    100% |████████████████████████████████| 12.9MB 3.1MB/s 
Collecting scikit-image<0.15,>=0.14 (from tensorlayer)
[?25l  Downloading https://files.pythonhosted.org/packages/9c/90/553120309c53bdfca25c9c50769ae40a538a90c24db8c082468aec898d00/scikit_image-0.14.1-cp36-cp36m-manylinux1_x86_64.whl (25.3MB)
[K    100% |████████████████████████████████| 25.3MB 1.7MB/s 
Collecting requests<2.21,>=2.19 (from tensorlayer)
[?25l  Downloading https://files.pythonhosted.org/packages/ff/17/5cbb02600511530

# Load File
Loading the two necessary files for creating the chatbot.As we can see with the following print,format of the two files is:


*  movie_lines.tsv: scene_id [tab] character_id [tab] character_name [tab] text
*  movie_conversations.tsv: character_id [tab] character_id2 [tab] movie_id [tab] scenes_id



In [2]:
# Load the data
lines = open('movie_lines.tsv', encoding='utf-8', errors='ignore').read().split('\n')
conv_lines = open('movie_conversations.tsv', encoding='utf-8', errors='ignore').read().split('\n')
print (lines[:5])
print (conv_lines[:5])

['L1045\tu0\tm0\tBIANCA\tThey do not!', 'L1044\tu2\tm0\tCAMERON\tThey do to!', 'L985\tu0\tm0\tBIANCA\tI hope so.', 'L984\tu2\tm0\tCAMERON\tShe okay?', "L925\tu0\tm0\tBIANCA\tLet's go."]
["u0\tu2\tm0\t['L194' 'L195' 'L196' 'L197']", "u0\tu2\tm0\t['L198' 'L199']", "u0\tu2\tm0\t['L200' 'L201' 'L202' 'L203']", "u0\tu2\tm0\t['L204' 'L205' 'L206']", "u0\tu2\tm0\t['L207' 'L208']"]


# Functions for handling the data
Here,I keep only the information relevant to the chatbot procedure.


*   *load_lines_to_ids*:I load the lines of the conversation , keep the line id [ first element] and the text [last element]
* *convs_to_line_ids*: Into the list where the line id's are, we assign the different conversation taking place in line file
* *split_questions_answer* :Split the dialogue into questions/answers.






In [0]:
def load_lines_to_ids(lines):
    id2line = {}
    for line in lines:
        _line = line.split('\t')
        if len(_line) >= 5:
            #Remove ' and " symbol from movie id.
            lineId=_line[0].replace("'","").replace("\"","")
            id2line[lineId] = ''.join(_line[4:(len(_line))])
    return id2line

def convs_to_line_ids(conv_lines):     
    convs = [ ]
    for line in conv_lines[:-1]:
      
        #To the place where movie id's are stores ( e.g. [ 'x1' 'x2' 'x3']) replace ' and [] symbols
        _line = line.split('\t')[-1][1:-1].replace("'","").replace("[","").replace("]","")
        convs.append(_line.split(' '))
    return convs

def split_questions_answer(id2line,convs):
    q = []
    a = []
    for conv in convs:
        for i in range(len(conv)-1):
            q.append(id2line[conv[i]])
            a.append(id2line[conv[i+1]])
    return (q,a)

In [4]:
id2l=load_lines_to_ids(lines)
c=convs_to_line_ids(conv_lines)
(q,a)=split_questions_answer(id2l,c)

for i in range(0,5):
    print("-"+str(q[i]))
    print("-"+str(a[i]))
    print("-------------------------------------")

print('Length of questions='+str(len(q)))
print('Length of answers='+str(len(a)))

-Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.
-Well I thought we'd start with pronunciation if that's okay with you.
-------------------------------------
-Well I thought we'd start with pronunciation if that's okay with you.
-Not the hacking and gagging and spitting part.  Please.
-------------------------------------
-Not the hacking and gagging and spitting part.  Please.
-Okay... then how 'bout we try out some French cuisine.  Saturday?  Night?
-------------------------------------
-You're asking me out.  That's so cute. What's your name again?
-Forget it.
-------------------------------------
-No no it's my fault -- we didn't have a proper introduction ---
-Cameron.
-------------------------------------
Length of questions=221616
Length of answers=221616


# Clean text
We must clean and lowercase the dataset,before we use it. Abbreviations such as *it's* should be transformed to* it is*.

In [0]:
def clean_text(text):
    '''Clean text by removing unnecessary characters and altering the format of words.'''

    text = text.lower()
    
    text = re.sub(r"i'm", "i am", text)
    text = re.sub(r"he's", "he is", text)
    text = re.sub(r"she's", "she is", text)
    text = re.sub(r"it's", "it is", text)
    text = re.sub(r"that's", "that is", text)
    text = re.sub(r"what's", "what is", text)
    text = re.sub(r"where's", "where is", text)
    text = re.sub(r"how's", "how is", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"can't", "cannot", text)
    text = re.sub(r"n't", " not", text)
    text = re.sub(r"n'", "ng", text)
    text = re.sub(r"'bout", "about", text)
    text = re.sub(r"'til", "until", text)
    text = re.sub(r"[-()\"#/@;:<>{}`+=~|.!?,]", "", text)
    
    return text

In [0]:
clean_q = []
for q_ in q:
    clean_q.append(clean_text(q_))   
clean_a = []    
for a_ in a:
    clean_a.append(clean_text(a_))

In [7]:
for i in range(0,5):
    print("-"+str(clean_q[i]))
    print("-"+str(clean_a[i]))
    print("-------------------------------------")

-can we make this quick  roxanne korrine and andrew barrett are having an incredibly horrendous public break up on the quad  again
-well i thought we would start with pronunciation if that is okay with you
-------------------------------------
-well i thought we would start with pronunciation if that is okay with you
-not the hacking and gagging and spitting part  please
-------------------------------------
-not the hacking and gagging and spitting part  please
-okay then how about we try out some french cuisine  saturday  night
-------------------------------------
-you are asking me out  that is so cute what is your name again
-forget it
-------------------------------------
-no no it is my fault  we did not have a proper introduction 
-cameron
-------------------------------------


# Upper and lower boundaries for sentence length.
I will do a small analysis on the length of the sentence. I would like to include sentences that will have a meaning for the whole process,thus i will see by finding the percentile values the upper and lower boundaries for my dataset. If we set the value of the length of the sentence to be from 2 words( not much meaning in sentence with one word) and 20 words,we will still include a large portion of the dataset,as it can easily be seen from the percentile values.

In [8]:
# Find the length of sentences
lengths = []
for question in clean_q:
    lengths.append(len(question.split()))
for answer in clean_a:
    lengths.append(len(answer.split()))

# Create a dataframe so that the values can be inspected
lengths = pd.DataFrame(lengths, columns=['counts'])
lengths.describe()
print('0th percentile value:'+str(np.percentile(lengths, 0)))
print('25th percentile value:'+str(np.percentile(lengths, 25)))
print('50th percentile value:'+str(np.percentile(lengths, 50)))
print('80th percentile value:'+str(np.percentile(lengths, 80)))
print('85th percentile value:'+str(np.percentile(lengths, 85)))
print('90th percentile value:'+str(np.percentile(lengths, 90)))
print('95th percentile value:'+str(np.percentile(lengths, 95)))
print('99th percentile value:'+str(np.percentile(lengths, 99)))

0th percentile value:0.0
25th percentile value:4.0
50th percentile value:7.0
80th percentile value:16.0
85th percentile value:19.0
90th percentile value:24.0
95th percentile value:32.0
99th percentile value:58.0


# Preprocess
I must filter the dataset. Also, a dictionary must be created containing the words and a numerical value associated with them. Neural networks work with numbers,not strings.
Furthermore, I decided to  force the following rules:


1.  For a word to be included,it must be present at least 10 times.
2.  Sentences that the percentage of uknown words is greater than 33 percent,will not be included in the train dataset.

I also included four keywords to help me out with the process:


*   PAD:  The sentences are of a fixed size. In order to make all the sentences to be equal size,we replace the sentences with far less length than the max we decided to follow, with this keyword
* EOS: End of stream,for the answer sequence
* GO : Start of answer sequence
* UNK: Used for word in sentences not present in vocabulary.}



In [0]:
def return_valid_sentences(clean_questions,clean_answers,min_,max_):
    questions_valid_=[]
    answers_valid_ = []
    questions_valid = []
    answers_valid = []

    for i,question in enumerate(clean_questions):
        if len(question.split()) >= min_ and len(question.split()) <= max_:
            questions_valid_.append(question)
            answers_valid_.append(clean_answers[i])

    for i,answer in enumerate(answers_valid_):
        if len(answer.split()) >= min_ and len(answer.split()) <= max_:
            answers_valid.append(answer)
            questions_valid.append(questions_valid_[i])
    return (answers_valid,questions_valid)

def get_vocab(av,qv):
    vocab = {}
    for question in qv:
        for word in question.split():
            if word not in vocab:
                vocab[word] = 1
            else:
                vocab[word] += 1

    for answer in av:
        for word in answer.split():
            if word not in vocab:
                vocab[word] = 1
            else:
                vocab[word] += 1
    return vocab

def remove_infrequent_words(threshold,vocab):
    q = {}
    word_num=1
    for word, count in vocab.items():
        if count >= threshold:
            q[word] = word_num
            word_num+=1
    return q

def text_to_int(q,a,voc):
    q_int = []
    for question in q:
        ints = []
        for word in question.split():
            if word not in voc:
                ints.append(voc['<UNK>'])
            else:
                ints.append(voc[word])
        q_int.append(ints)

    a_int = []
    for answer in a:
        ints = []
        for word in answer.split():
            if word not in voc:
                ints.append(voc['<UNK>'])
            else:
                ints.append(voc[word])
        a_int.append(ints)
    return (q_int,a_int)

def remove_uninteresting(q,a,voc):
    qr=[]
    ar=[]
    art=[]
    qrt=[]

    for i,q_ in enumerate(q):
        cnt=0
        tmp=q_.split(' ')
        for x in tmp:
            if x!='' and x not in voc.keys():
                cnt+=1
        if cnt<round(len(tmp)/3):
            qrt.append(q_)
            art.append(a[i])
    for i,a_ in enumerate(art):
        cnt=0
        tmp=a_.split(' ')
        for x in tmp:
            if x!='' and x not in voc.keys():
                cnt+=1
        if cnt<round(len(tmp)/3):
            ar.append(a_)
            qr.append(qrt[i])
    return (qr,ar)
  
def pad_sentence(s,code,max_):
  max_sentence = max_
  return [sentence + [code] * (max_sentence - len(sentence)) for sentence in s]

def get_mask(target_seqs):
  masked=[]
  for s in target_seqs:
    tmp=[]
    for x in s:
      if x==0:
        tmp.append(0)
      else:
        tmp.append(1)
    masked.append(tmp)
  return masked

In [10]:
min_=2
max_=20
(a_valid,q_valid)=return_valid_sentences(clean_q,clean_a,min_,max_)
vocab=get_vocab(a_valid,q_valid)
threshold=10
word2idx=remove_infrequent_words(threshold,vocab)
print(len(q_valid))
(q_valid,a_valid)=remove_uninteresting(q_valid,a_valid,word2idx)
print(len(q_valid))
print(len(a_valid))

codes = ['<EOS>','<UNK>','<GO>']
word2idx['<PAD>']=0
for code in codes:
    word2idx[code] = len(word2idx)
    print(word2idx[code])
idx2word = {v_i: v for v, v_i in word2idx.items()}
    
(q_int,a_int)=text_to_int(q_valid,a_valid,word2idx)

print(q_int[0],'\n',q_valid[0])
print('====================================================')
print(a_int[0],'\n',a_valid[0])
input_length=len(q_int)
print(input_length,len(a_int))

138333
126329
126329
8102
8103
8104
[1, 2, 3, 4, 5, 6, 7, 8103, 8, 9, 10, 11, 7, 12] 
 well i thought we would start with pronunciation if that is okay with you
[13, 14, 8103, 15, 8103, 15, 16, 17, 18] 
 not the hacking and gagging and spitting part  please
126329 126329


In [0]:
unk_id = word2idx['<UNK>']   # 1
pad_id = word2idx['<PAD>']     # 0

start_id = word2idx['<GO>']  # 8002
end_id =  word2idx['<EOS>']  # 8003

# Model creation
Main part of the procedure.
We use the seq to seq function out of tensorflow.
Sequence to sequence is a neural network architecture. It contain two basically different neural networks, the encoder and the decoder, and its purpose is to feed the encoder with a sequence,the encoder will try to find a different represantation,which afterwards will be forwarded to the decoder,whose job is to decode the encoded sequence and produce the output.

In [0]:
def create_model(encode_seqs, decode_seqs, src_vocab_size, emb_dim, is_train=True, reuse=False):
    with tf.variable_scope("model", reuse=reuse):
        with tf.variable_scope("embedding") as vs:
            #The input layer:
            net_encode = EmbeddingInputlayer(
                inputs = encode_seqs,
                vocabulary_size = src_vocab_size,
                embedding_size = emb_dim,
                name = 'seq_embedding')
            vs.reuse_variables()
            #The output layer
            net_decode = EmbeddingInputlayer(
                inputs = decode_seqs,
                vocabulary_size = src_vocab_size,
                embedding_size = emb_dim,
                name = 'seq_embedding')
            
        net_rnn = Seq2Seq(net_encode, net_decode,
                cell_fn = tf.nn.rnn_cell.LSTMCell,
                n_hidden = emb_dim,
                initializer = tf.random_uniform_initializer(-0.1, 0.1),
                encode_sequence_length = retrieve_seq_length_op2(encode_seqs),
                decode_sequence_length = retrieve_seq_length_op2(decode_seqs),
                initial_state_encode = None,
                dropout = (0.5 if is_train else None),
                n_layer = 3,
                return_seq_2d = True,
                name = 'seq2seq')

        net_out = DenseLayer(net_rnn, n_units=src_vocab_size, act=tf.identity, name='output')
    return net_out, net_rnn

Reset the graph in case something from a previous execution remains. Also,initialize variables for configuration.

In [0]:

tf.reset_default_graph()
sess_config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
sess_config.gpu_options.allow_growth = True

 
Here I set the placeholders for input and output.We also want to have a third placeholder,the mask.I use the cross entropy loss function ( cross-entropy describes the loss between two probability distributions and measures how close is the predicted distribution to the true distribution), alongside mask.Mask is  a vector containing 1's where the sentence have a word and 0's when it has PAD item.The mask is going to be used to skip any input with mask 0 by copying the previous hidden state of the cell; it will proceed normally for any input with mask 1.Masking allows us to handle various length inputs in RNNs.

In [14]:
batch_size=128
n_step = len(q_int) // batch_size
with tf.device('/device:GPU:0'):
  encode_seqs_= tf.placeholder(dtype=tf.int64, shape=[batch_size, None], name="encode_seqs")
  decode_seqs_ = tf.placeholder(dtype=tf.int64, shape=[batch_size, None], name="decode_seqs")
  target_seqs_ = tf.placeholder(dtype=tf.int64, shape=[batch_size, None], name="target_seqs")
  target_mask_ = tf.placeholder(dtype=tf.int64, shape=[batch_size, None], name="target_mask") 
  net_out, _ = create_model(encode_seqs_, decode_seqs_, len(word2idx), 512, is_train=True, reuse=False)
  net_out.print_params(False)

  # Inference Data Placeholders
  encode_seqs2 = tf.placeholder(dtype=tf.int64, shape=[1, None], name="encode_seqs")
  decode_seqs2 = tf.placeholder(dtype=tf.int64, shape=[1, None], name="decode_seqs")

  net, net_rnn = create_model(encode_seqs2, decode_seqs2, len(word2idx), 512, is_train=False, reuse=True)
  y = tf.nn.softmax(net.outputs)

  # Loss Function
  loss = tl.cost.cross_entropy_seq_with_mask(logits=net_out.outputs, target_seqs=target_seqs_, 
                                              input_mask=target_mask_, return_details=False, name='cost')

  # Optimizer
  optimizer=tf.train.AdamOptimizer(learning_rate=0.001)
  train_op = optimizer.minimize(loss)
  

[TL] EmbeddingInputlayer model/embedding/seq_embedding: (8105, 512)
[TL] EmbeddingInputlayer model/embedding/seq_embedding: (8105, 512)
[TL] [*] Seq2Seq model/seq2seq: n_hidden: 512 cell_fn: LSTMCell dropout: 0.5 n_layer: 3
[TL] DynamicRNNLayer model/seq2seq/encode: n_hidden: 512, in_dim: 3 in_shape: (128, ?, 512) cell_fn: LSTMCell dropout: 0.5 n_layer: 3
[TL]        batch_size (concurrent processes): 128
[TL] DynamicRNNLayer model/seq2seq/decode: n_hidden: 512, in_dim: 3 in_shape: (128, ?, 512) cell_fn: LSTMCell dropout: 0.5 n_layer: 3
[TL]        batch_size (concurrent processes): 128
[TL] DenseLayer  model/output: 8105 No Activation
[TL]   param   0: model/embedding/seq_embedding/embeddings:0 (8105, 512)        float32_ref
[TL]   param   1: model/seq2seq/encode/rnn/multi_rnn_cell/cell_0/lstm_cell/kernel:0 (1024, 2048)       float32_ref
[TL]   param   2: model/seq2seq/encode/rnn/multi_rnn_cell/cell_0/lstm_cell/bias:0 (2048,)            float32_ref
[TL]   param   3: model/seq2seq/enco

Here we initialize the variables and initialize a session with the beforementioned configuration.

In [16]:
a=tf.initialize_all_variables()
b=tf.global_variables_initializer()
sess = tf.Session(config=sess_config)

# Actually intialize the variables

Instructions for updating:
Use `tf.global_variables_initializer` instead.


Inference is the answer to a question made by the user.What it does is it gets the question string and inserts it into the neural network designed specifically for user questions. It uses the trained model in the following step to generate the most appropriate sequence,based on the knowledge gained from the training procedure.

In [0]:
def inference(seed):
  seed=clean_text(seed)
  seed_id = [word2idx.get(w, unk_id) for w in seed.split(" ")]
  
  state = sess.run(net_rnn.final_state_encode,
                  {encode_seqs2: [seed_id]})
  #Run the softmax function to the output of the decoded rnn used for inference
  o, state = sess.run([y, net_rnn.final_state_decode],
                  {net_rnn.initial_state_decode: state,
                  decode_seqs2: [[start_id]]})
  w_id = tl.nlp.sample_top(o[0], top_k=10)
  w = idx2word[w_id]
  # Decode and feed state iteratively
  sentence = [w]
  #As far as the output symbol is not EOS,keep feeding.
  for _ in range(30): # max sentence length
      o, state = sess.run([y, net_rnn.final_state_decode],
                      {net_rnn.initial_state_decode: state,
                      decode_seqs2: [[w_id]]})
      w_id = tl.nlp.sample_top(o[0], top_k=10)
      w = idx2word[w_id]
      if w_id == end_id:
          break
      sentence = sentence + [w]
  return sentence
      

# Training 

The main training phase. The iteration iterates up to number of epochs defined below, and splits dataset to trainX and trainY.Before we split the dataset,we shuffle the data to assure at most that batches will contain data with multiple distributions and the neural network will be trained with as many different data samples as possible.

In [0]:
seeds = ["do you love trump","how are you","i love you","fuck you","do you like drinking","what is your name"]
num_epochs=25
sess.run(a)
sess.run(b)
import sys
sys.stdout.flush()
for epoch in range(num_epochs):
    trainX, trainY = shuffle(q_int, a_int, random_state=0)
    total_loss, n_iter = 0, 0
    for X, Y in tqdm(tl.iterate.minibatches(inputs=trainX, targets=trainY, batch_size=batch_size, shuffle=False), 
                    total=n_step, desc='Epoch[{}/{}]'.format(epoch + 1, num_epochs), leave=False):

        X = pad_sentence(X,word2idx['<PAD>'],max_)
        _target_seqs = tl.prepro.sequences_add_end_id(Y, end_id=end_id)
        _target_seqs = pad_sentence(_target_seqs,word2idx['<PAD>'],max_+1)
        _decode_seqs = tl.prepro.sequences_add_start_id(Y, start_id=start_id, remove_last=False)
        _decode_seqs = pad_sentence(_decode_seqs,word2idx['<PAD>'],max_+1)
        _target_mask = get_mask(_target_seqs)
        
        _, loss_iter = sess.run([train_op, loss], {encode_seqs_: X, decode_seqs_: _decode_seqs,
                        target_seqs_: _target_seqs, target_mask_: _target_mask})
        total_loss += loss_iter
        n_iter += 1

    # printing average loss after every epoch
    print('Epoch [{}/{}]: loss {:.4f} learning rate={:.4f}'.format(epoch + 1, num_epochs, total_loss / n_iter,optimizer._lr))

    # inference after every epoch
    for seed in seeds:
        print("Query >", seed)
        for _ in range(5):
            sentence = inference(seed)
            print(" >", ' '.join(sentence))

    # saving the model
    tl.files.save_npz(net.all_params, name='model.npz', sess=sess)
from google.colab import files
files.download('model.npz')
# session cleanup
sess.close()

After the lengthy training procedure,we save and keep the model to use it again without having to train the model again.

In [23]:
tl.files.load_and_assign_npz(sess=sess, name='model.npz', network=net)

[TL] [*] Load model.npz SUCCESS!


<tensorlayer.layers.dense.base_dense.DenseLayer at 0x7fc7e857ae10>

Interface to communicate with user.q exits the execution.

In [26]:
s=''
print('q to exit')
while s!='q':
  s=input('-')
  if s!='q':
    m=inference(s)
    print("-",' '.join(m))
print("Adios!")

q to exit
-my heart hurts
- i am sorry
-i feel negligible
- and the next time you are going to get a lot more
-i feel empty
- well what happened
-i lost my mother
- what is it
-the person who gave birth to me
- you will have to <UNK> it up
-suck ?
- i will be back soon
-why you always leving
- i am an actor
-what do you do
- i am going to kill her
-Who?
- your wife she is a <UNK>
-What the hell
- what are you talking about
-Robots really tok over
- and what are you thinking about
-stopping you all
- and you are the one who wants to be a fireman
-i never said that
- i have heard what i wanted


KeyboardInterrupt: ignored