In [1]:
import numpy as np
import tensorflow as tf
import pandas
import pickle as pkl
from tensorflow.keras import layers , activations , models , preprocessing, utils
import random
random.seed(2020)
import re

In [2]:
min_line_length = 1
max_line_length = 12
HIDDEN_DIM=200

In [3]:
with open(r"data\conversations.pkl", "rb") as handle:
    conversation = pkl.load(handle)

In [4]:
#Take a random sample of conversations

sample_size = 30000

indices = random.sample(range(len(conversation)), sample_size)

conv_sample = []

for i in indices:
    conv = conversation[i]
    conv_sample.append(conv)


In [5]:
# Sort the sentences into questions (inputs) and answers (targets)
questions = []
answers = []

for conv in conv_sample:
    for i in range(len(conv)-1):
        questions.append(conv[i])
        answers.append(conv[i+1])

In [6]:
def clean_text(text):
    '''Clean text by removing unnecessary characters and altering the format of words.'''
    text = text.lower()
    text = re.sub(r"i'm", "i am", text)
    text = re.sub(r"he's", "he is", text)
    text = re.sub(r"she's", "she is", text)
    text = re.sub(r"it's", "it is", text)
    text = re.sub(r"that's", "that is", text)
    text = re.sub(r"what's", "that is", text)
    text = re.sub(r"where's", "where is", text)
    text = re.sub(r"how's", "how is", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"can't", "cannot", text)
    text = re.sub(r"n't", " not", text)
    text = re.sub(r"n'", "ng", text)
    text = re.sub(r"'bout", "about", text)
    text = re.sub(r"'til", "until", text)
    text = re.sub(r"[-()\"#/@;:<>{}`+=~|.!?,]", "", text)
    text = re.sub(r"  ", " ", text)
    return text

In [7]:
# Clean the data
clean_questions = []
for question in questions:
    clean_questions.append(clean_text(question))
    
clean_answers = []    
for answer in answers:
    clean_answers.append(clean_text(answer))

In [8]:
# Filter out the questions that are too short/long
short_questions_temp = []
short_answers_temp = []

i = 0
for question in clean_questions:
    if len(question.split()) >= min_line_length and len(question.split()) <= max_line_length:
        short_questions_temp.append(question)
        short_answers_temp.append(clean_answers[i])
    i += 1

# Filter out the answers that are too short/long
short_questions = []
short_answers = []

i = 0
for answer in short_answers_temp:
    if len(answer.split()) >= min_line_length and len(answer.split()) <= max_line_length:
        short_answers.append(answer)
        short_questions.append(short_questions_temp[i])
    i += 1

In [9]:
# Compare the number of lines we will use with the total number of lines.
print("# of questions:", len(short_questions))
print("# of answers:", len(short_answers))
print("% of data used: {}%".format(round(len(short_questions)/len(questions),4)*100))

# of questions: 41389
# of answers: 41389
% of data used: 51.980000000000004%


In [10]:
tokenizer = preprocessing.text.Tokenizer()
tokenizer.fit_on_texts( short_questions + short_answers )
VOCAB_SIZE = len( tokenizer.word_index )+1
print( 'VOCAB SIZE : {}'.format( VOCAB_SIZE ))

VOCAB SIZE : 19461


In [11]:
def trim_rare_words(word_counts, list_of_questions, list_of_answers, min_word_freq=1):
    """
    take a dict of word wounts, a list of questions and a list of answers 
    and return these lists without those that have rare words. 
    Optional argument sets the minimum frequency of the rare words. 
    """
    keep_words = {}
    for key, value in word_counts:
        if value > min_word_freq:
            keep_words[key] = value
    
    keep_questions_indeces = []
    for i, question in enumerate(list_of_questions):
        q_words = question.split(' ')
        if all(word in keep_words.keys() for word in q_words):
            keep_questions_indeces.append(i)
    keep_answer_indeces = []
    for i, answer in enumerate(list_of_answers):
        a_words = answer.split(' ')
        if all(word in keep_words.keys() for word in a_words):
            keep_answer_indeces.append(i)
    
    total_keep = set(keep_questions_indeces).intersection(keep_answer_indeces)
    
    keep_questions = [question for i, question in enumerate(list_of_questions) if i in total_keep ]
    keep_answers = [answer for i, answer in enumerate(list_of_answers) if i in total_keep ]
    
    
    return keep_questions, keep_answers

k_questions, k_answers = trim_rare_words(tokenizer.word_counts.items(), short_questions, short_answers, 2)  

print('final number of samples: {}'.format(len(k_questions)))

final number of samples: 27750


In [12]:
def tagger(decoder_input_sentence):
    bos = "<BOS> "
    eos = " <EOS>"
    final_target = [bos + text + eos for text in decoder_input_sentence] 
    return final_target

tagged_answers = tagger(k_answers)

In [13]:
tokenizer = preprocessing.text.Tokenizer()
tokenizer.fit_on_texts( k_questions + tagged_answers )
VOCAB_SIZE = len( tokenizer.word_index )+1
print( 'New VOCAB SIZE : {}'.format( VOCAB_SIZE ))

New VOCAB SIZE : 7036


In [14]:
# transform text to tokenized padded sequences, for the encoder and decoder inputs

# encoder_input_data
tokenized_questions = tokenizer.texts_to_sequences( k_questions )
maxlen_questions = max( [ len(x) for x in tokenized_questions ] )
padded_questions = preprocessing.sequence.pad_sequences( tokenized_questions , maxlen=maxlen_questions , padding='post' )
encoder_input_data = np.array( padded_questions )
print( encoder_input_data.shape , maxlen_questions )

# decoder_input_data
tokenized_answers = tokenizer.texts_to_sequences( tagged_answers )
maxlen_answers = max( [ len(x) for x in tokenized_answers ] )
padded_answers = preprocessing.sequence.pad_sequences( tokenized_answers , maxlen=maxlen_answers , padding='post' )
decoder_input_data = np.array( padded_answers )
print( decoder_input_data.shape , maxlen_answers )

# Do the same but with one-hot-encoding for the decoder outputs. 

# decoder_output_data
tokenized_answers = tokenizer.texts_to_sequences( tagged_answers )
for i in range(len(tokenized_answers)) :
    tokenized_answers[i] = tokenized_answers[i][1:]
padded_answers = preprocessing.sequence.pad_sequences( tokenized_answers , maxlen=maxlen_answers , padding='post' )
onehot_answers = utils.to_categorical( padded_answers , VOCAB_SIZE )
decoder_output_data = np.array( onehot_answers )
print( decoder_output_data.shape )

# Saving all the arrays to storage
#np.save( 'enc_in_data.npy' , encoder_input_data )
#np.save( 'dec_in_data.npy' , decoder_input_data )
#np.save( 'dec_tar_data.npy' , decoder_output_data )

(27750, 12) 12
(27750, 14) 14
(27750, 14, 7036)


In [15]:
# The encoding layer of the model

encoder_inputs = tf.keras.layers.Input(shape=( None , ))
encoder_embedding = tf.keras.layers.Embedding( VOCAB_SIZE, 200 , mask_zero=True ) (encoder_inputs)
encoder_outputs , state_h , state_c = tf.keras.layers.LSTM( 200 , return_state=True )( encoder_embedding )
encoder_states = [ state_h , state_c ]

# The decoding layer, initialised with the encoder's output state
decoder_inputs = tf.keras.layers.Input(shape=( None ,  ))
decoder_embedding = tf.keras.layers.Embedding( VOCAB_SIZE, 200 , mask_zero=True) (decoder_inputs)
decoder_lstm = tf.keras.layers.LSTM( 200 , return_state=True , return_sequences=True )
decoder_outputs , _ , _ = decoder_lstm ( decoder_embedding , initial_state=encoder_states )

# A dense output layer, with a softmax activation
decoder_dense = tf.keras.layers.Dense( VOCAB_SIZE , activation=tf.keras.activations.softmax ) 
output = decoder_dense ( decoder_outputs )

# Compile the model using the RMSprop Optimiser
model = tf.keras.models.Model([encoder_inputs, decoder_inputs], output )
model.compile(optimizer=tf.keras.optimizers.RMSprop(), loss='categorical_crossentropy', metrics=['accuracy'])

model.summary()

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, None, 200)    1407200     input_1[0][0]                    

In [16]:
%%time
batch_size = 100
epochs = 45

history = model.fit([encoder_input_data, decoder_input_data],decoder_output_data,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_split=0.2)

Train on 22200 samples, validate on 5550 samples
Epoch 1/45
Epoch 2/45
Epoch 3/45
Epoch 4/45
Epoch 5/45
Epoch 6/45
Epoch 7/45
Epoch 8/45
Epoch 9/45
Epoch 10/45
Epoch 11/45
Epoch 12/45
Epoch 13/45
Epoch 14/45
Epoch 15/45
Epoch 16/45
Epoch 17/45
Epoch 18/45
Epoch 19/45
Epoch 20/45
Epoch 21/45
Epoch 22/45
Epoch 23/45
Epoch 24/45
Epoch 25/45
Epoch 26/45
Epoch 27/45
Epoch 28/45
Epoch 29/45
Epoch 30/45
Epoch 31/45
Epoch 32/45
Epoch 33/45
Epoch 34/45
Epoch 35/45
Epoch 36/45
Epoch 37/45
Epoch 38/45
Epoch 39/45
Epoch 40/45
Epoch 41/45
Epoch 42/45
Epoch 43/45
Epoch 44/45
Epoch 45/45
Wall time: 1h 25min 7s


In [17]:
model_json = model.to_json()
with open("LARGE_s2s_model_keras.json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
model.save_weights("LARGE_s2s_model_keras_weights.h5")
print("Saved model to disk")

Saved model to disk


In [18]:
model.load_weights('LARGE_s2s_model_keras_weights.h5')

In [18]:
# In order to make inference, we need to tweek the process a bit. 
# We initialise the encoder's and decoder's states using the trained model.

def make_inference_models():
    
    encoder_model = tf.keras.models.Model(encoder_inputs, encoder_states)
    
    decoder_state_input_h = tf.keras.layers.Input(shape=( 200 ,))
    decoder_state_input_c = tf.keras.layers.Input(shape=( 200 ,))
    
    decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
    
    decoder_outputs, state_h, state_c = decoder_lstm(
        decoder_embedding , initial_state=decoder_states_inputs)
    decoder_states = [state_h, state_c]
    decoder_outputs = decoder_dense(decoder_outputs)
    decoder_model = tf.keras.models.Model(
        [decoder_inputs] + decoder_states_inputs,
        [decoder_outputs] + decoder_states)
    
    return encoder_model , decoder_model

In [19]:
def str_to_tokens( sentence : str ):
    words = sentence.lower().split()
    tokens_list = list()
    for word in words:
        tokens_list.append( tokenizer.word_index[ word ] ) 
    return preprocessing.sequence.pad_sequences( [tokens_list] , maxlen=maxlen_questions , padding='post')

In [20]:
enc_model , dec_model = make_inference_models()

for _ in range(10):
    states_values = enc_model.predict( str_to_tokens( input( 'Enter question : ' ) ) )
    empty_target_seq = np.zeros( ( 1 , 1 ) )
    empty_target_seq[0, 0] = tokenizer.word_index['bos']
    stop_condition = False
    decoded_translation = ''
    while not stop_condition :
        dec_outputs , h , c = dec_model.predict([ empty_target_seq ] + states_values )
        sampled_word_index = np.argmax( dec_outputs[0, -1, :] )
        sampled_word = None
        for word , index in tokenizer.word_index.items() :
            if sampled_word_index == index :
                decoded_translation += ' {}'.format( word )
                sampled_word = word
        
        if sampled_word == 'eos' or len(decoded_translation.split()) > maxlen_answers:
            stop_condition = True
            
        empty_target_seq = np.zeros( ( 1 , 1 ) )  
        empty_target_seq[ 0 , 0 ] = sampled_word_index
        states_values = [ h , c ] 

    print( decoded_translation )


Enter question : hello
 hello eos
Enter question : how are you
 i am i am fine eos
Enter question : what do you like
 i am not like a friend eos
Enter question : who is your friend
 i am not gonna marry you my take a walk eos
Enter question : i will leave you alone
 when you were in eos
Enter question : just now
 i know eos
Enter question : what else do you know
 i know it is not to but eos
Enter question : but what
 i have got a walk with you eos
Enter question : when
 tomorrow night there's no no of course eos
Enter question : of course
 well it is a long time eos
