In [1]:
from __future__ import print_function

import sys
import os
import pandas as pd
import numpy as np
import re     # stand for Regular expression operations
import nltk   # Natural Language Toolkit

from keras.layers import Input, Embedding, LSTM, TimeDistributed, Dense, Bidirectional
from keras.models import Model, load_model
from keras.layers import Activation, dot, concatenate

INPUT_LENGTH = 20
OUTPUT_LENGTH = 20




In [10]:
# Load the data
lines = open('movie_lines.txt', encoding='utf-8', errors='ignore').read().split('\n')
conv_lines = open('movie_conversations.txt', encoding='utf-8', errors='ignore').read().split('\n')

In [11]:
# Create a dictionary to map each line's id with its text
id2line = {}
for line in lines:
    _line = line.split(' +++$+++ ')
    if len(_line) == 5:
        id2line[_line[0]] = _line[4]

In [12]:
# Create a list of all of the conversations' lines' ids.
convs = []
for line in conv_lines[:-1]:
    _line = line.split(' +++$+++ ')[-1][1:-1].replace("'","").replace(" ","")
    convs.append(_line.split(','))

In [13]:
#id and conversation sample
for k in convs[200]:
    print (k, id2line[k])

L929 What just happened?
L930 Your daughters went to the prom.
L931 Did I have anything to say about it?
L932 Absolutely not.
L933 That ' s what I thought


In [14]:
# Sort the sentences into questions (inputs) and answers (targets)
questions = []
answers = []
for conv in convs:
    for i in range(len(conv)-1):
        questions.append(id2line[conv[i]])
        answers.append(id2line[conv[i+1]])
        
# Compare lengths of questions and answers
print(len(questions))
print(len(answers))

221616
221616


In [15]:
def clean_text(text):
    '''Clean text by removing unnecessary characters and altering the format of words.'''
    text = text.lower()
    text = re.sub(r"i'm", "i am", text)
    text = re.sub(r"he's", "he is", text)
    text = re.sub(r"she's", "she is", text)
    text = re.sub(r"it's", "it is", text)
    text = re.sub(r"that's", "that is", text)
    text = re.sub(r"what's", "that is", text)
    text = re.sub(r"where's", "where is", text)
    text = re.sub(r"how's", "how is", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"can't", "cannot", text)
    text = re.sub(r"n't", " not", text)
    text = re.sub(r"n'", "ng", text)
    text = re.sub(r"'bout", "about", text)
    text = re.sub(r"'til", "until", text)
    text = re.sub(r"[-()\"#/@;:<>{}`+=~|]", "", text)
#     text = re.sub(r"[-()\"#/@;:<>{}`+=~|.!?,]", "", text)
    text = " ".join(text.split())
    return text

In [16]:
# Clean the data
clean_questions = []
for question in questions:
    clean_questions.append(clean_text(question))
clean_answers = []    
for answer in answers:
    clean_answers.append(clean_text(answer))

In [17]:
# Find the length of sentences (not using nltk due to processing speed)
lengths = []
# lengths.append([len(nltk.word_tokenize(sent)) for sent in clean_questions]) #nltk approach
for question in clean_questions:
    lengths.append(len(question.split()))
for answer in clean_answers:
    lengths.append(len(answer.split()))
# Create a dataframe so that the values can be inspected
lengths = pd.DataFrame(lengths, columns=['counts'])
print(np.percentile(lengths, 80))
print(np.percentile(lengths, 85))
print(np.percentile(lengths, 90))
print(np.percentile(lengths, 95))

16.0
19.0
24.0
32.0


In [18]:
# Remove questions and answers that are shorter than 1 word and longer than 20 words.
min_line_length = 2
max_line_length = 20

# Filter out the questions that are too short/long
short_questions_temp = []
short_answers_temp = []

for i, question in enumerate(clean_questions):
    if len(question.split()) >= min_line_length and len(question.split()) <= max_line_length:
        short_questions_temp.append(question)
        short_answers_temp.append(clean_answers[i])

# Filter out the answers that are too short/long
short_questions = []
short_answers = []

for i, answer in enumerate(short_answers_temp):
    if len(answer.split()) >= min_line_length and len(answer.split()) <= max_line_length:
        short_answers.append(answer)
        short_questions.append(short_questions_temp[i])
        
print(len(short_questions))
print(len(short_answers))

138528
138528


In [19]:
r = np.random.randint(1,len(short_questions))

for i in range(r, r+3):
    print(short_questions[i])
    print(short_answers[i])
    print()

well, i, uh, i will be leaving town for a little while.
for how long?

for how long?
oh, i do not know.

oh, i do not know.
a week? a year?



In [20]:
#choosing number of samples
num_samples = 90000  # Number of samples to train on.
short_questions = short_questions[:num_samples]
short_answers = short_answers[:num_samples]
#tokenizing the qns and answers
short_questions_tok = [nltk.word_tokenize(sent) for sent in short_questions]
short_answers_tok = [nltk.word_tokenize(sent) for sent in short_answers]

In [21]:
#train-validation split
data_size = len(short_questions_tok)

# We will use the first 0-80th %-tile (80%) of data for the training
training_input  = short_questions_tok[:round(data_size*(80/100))]
training_input  = [tr_input[::-1] for tr_input in training_input] #reverseing input seq for better performance
training_output = short_answers_tok[:round(data_size*(80/100))]

# We will use the remaining for validation
validation_input = short_questions_tok[round(data_size*(80/100)):]
validation_input  = [val_input[::-1] for val_input in validation_input] #reverseing input seq for better performance
validation_output = short_answers_tok[round(data_size*(80/100)):]

print('training size', len(training_input))
print('validation size', len(validation_input))

training size 72000
validation size 18000


# Word encoding decoding dictionaries

In [22]:
# Create a dictionary for the frequency of the vocabulary 
vocab = {}
for question in short_questions_tok:
    for word in question:
        if word not in vocab:
            vocab[word] = 1
        else:
            vocab[word] += 1

for answer in short_answers_tok:
    for word in answer:
        if word not in vocab:
            vocab[word] = 1
        else:
            vocab[word] += 1     

## Remove the rare word from the vocabulary

In [23]:
# Remove rare words from the vocabulary.
# We will aim to replace fewer than 5% of words with <UNK>
# You will see this ratio soon.
threshold = 15
count = 0
for k,v in vocab.items():
    if v >= threshold:
        count += 1

In [24]:
print("Size of total vocab:", len(vocab))
print("Size of vocab we will use:", count)

Size of total vocab: 31650
Size of vocab we will use: 4387


In [25]:
#we will create dictionaries to provide a unique integer for each word.
WORD_CODE_START = 1
WORD_CODE_PADDING = 0


word_num  = 2 #number 1 is left for WORD_CODE_START for model decoder later
encoding = {}
decoding = {1: 'START'}
for word, count in vocab.items():
    if count >= threshold: #get vocabularies that appear above threshold count
        encoding[word] = word_num 
        decoding[word_num ] = word
        word_num += 1

print("No. of vocab used:", word_num)

No. of vocab used: 4389


In [26]:
#include unknown token for words not in dictionary
decoding[len(encoding)+2] = '<UNK>'
encoding['<UNK>'] = len(encoding)+2

In [27]:
dict_size = word_num+1
dict_size

4390

# Vectorizing dataset

In [28]:
def transform(encoding, data, vector_size=20):
    """
    :param encoding: encoding dict built by build_word_encoding()
    :param data: list of strings
    :param vector_size: size of each encoded vector
    """
    transformed_data = np.zeros(shape=(len(data), vector_size))
    for i in range(len(data)):
        for j in range(min(len(data[i]), vector_size)):
            try:
                transformed_data[i][j] = encoding[data[i][j]]
            except:
                transformed_data[i][j] = encoding['<UNK>']
    return transformed_data

In [29]:
#encoding training set
encoded_training_input = transform(
    encoding, training_input, vector_size=INPUT_LENGTH)
encoded_training_output = transform(
    encoding, training_output, vector_size=OUTPUT_LENGTH)

print('encoded_training_input', encoded_training_input.shape)
print('encoded_training_output', encoded_training_output.shape)

encoded_training_input (72000, 20)
encoded_training_output (72000, 20)


In [30]:
#encoding validation set
encoded_validation_input = transform(
    encoding, validation_input, vector_size=INPUT_LENGTH)
encoded_validation_output = transform(
    encoding, validation_output, vector_size=OUTPUT_LENGTH)

print('encoded_validation_input', encoded_validation_input.shape)
print('encoded_validation_output', encoded_validation_output.shape)

encoded_validation_input (18000, 20)
encoded_validation_output (18000, 20)


# Model

In [31]:
import tensorflow as tf
tf.keras.backend.clear_session()
tf.compat.v1.disable_v2_behavior()

Instructions for updating:
non-resource variables are not supported in the long term


In [32]:
INPUT_LENGTH = 20
OUTPUT_LENGTH = 20

encoder_input = Input(shape=(INPUT_LENGTH,))
decoder_input = Input(shape=(OUTPUT_LENGTH,))

In [33]:
from keras.layers import SimpleRNN
path_checkpoint = "model_checkpoint1.h5"
callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=4)
modelckpt_callback = tf.keras.callbacks.ModelCheckpoint(monitor="val_loss",filepath=path_checkpoint, save_weights_only=True, save_best_only=True, )
encoder = Embedding(dict_size, 128, input_length=INPUT_LENGTH, mask_zero=True)(encoder_input)
encoder = LSTM(256, return_sequences=True, unroll=True)(encoder)
encoder_last = encoder[:,-1,:]

print('encoder', encoder)
print('encoder_last', encoder_last)

decoder = Embedding(dict_size, 128, input_length=OUTPUT_LENGTH, mask_zero=True)(decoder_input)
decoder = LSTM(256, return_sequences=True, unroll=True)(decoder, initial_state=[encoder_last, encoder_last])


print('decoder', decoder)

# For the plain Sequence-to-Sequence, we produced the output from directly from decoder
# output = TimeDistributed(Dense(output_dict_size, activation="softmax"))(decoder)

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
encoder Tensor("lstm/transpose_2:0", shape=(?, 20, 256), dtype=float32)
encoder_last Tensor("strided_slice:0", shape=(?, 256), dtype=float32)
decoder Tensor("lstm_1/transpose_2:0", shape=(?, 20, 256), dtype=float32)


# Attention Mechanism

In [34]:
attention = dot([decoder, encoder], axes=[2, 2])
attention = Activation('softmax', name='attention')(attention)
print('attention', attention)

context = dot([attention, encoder], axes=[2,1])
print('context', context)

decoder_combined_context = concatenate([context, decoder])
print('decoder_combined_context', decoder_combined_context)

# Has another weight + tanh layer as described in equation (5) of the paper
output = TimeDistributed(Dense(256, activation="tanh"))(decoder_combined_context)
output = TimeDistributed(Dense(dict_size, activation="softmax"))(output)
print('output', output)

attention Tensor("attention/Softmax:0", shape=(?, 20, 20), dtype=float32)
context Tensor("dot_1/MatMul:0", shape=(?, 20, 256), dtype=float32)
decoder_combined_context Tensor("concatenate/concat:0", shape=(?, 20, 512), dtype=float32)
output Tensor("time_distributed_1/Reshape_1:0", shape=(?, 20, 4390), dtype=float32)


In [35]:
model = Model(inputs=[encoder_input, decoder_input], outputs=[output])
model.compile(optimizer='adam', loss='binary_crossentropy')
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 20)]         0           []                               
                                                                                                  
 embedding (Embedding)          (None, 20, 128)      561920      ['input_1[0][0]']                
                                                                                                  
 input_2 (InputLayer)           [(None, 20)]         0           []                               
                                                                                                  
 lstm (LSTM)                    (None, 20, 256)      394240      ['embedding[0][0]']              
                                                                                              

In [36]:
training_encoder_input = encoded_training_input
training_decoder_input = np.zeros_like(encoded_training_output)
training_decoder_input[:, 1:] = encoded_training_output[:,:-1]
training_decoder_input[:, 0] = WORD_CODE_START
training_decoder_output = np.eye(dict_size)[encoded_training_output.astype('int')]

validation_encoder_input = encoded_validation_input
validation_decoder_input = np.zeros_like(encoded_validation_output)
validation_decoder_input[:, 1:] = encoded_validation_output[:,:-1]
validation_decoder_input[:, 0] = WORD_CODE_START
validation_decoder_output = np.eye(dict_size)[encoded_validation_output.astype('int')]

In [37]:
model.fit(x=[training_encoder_input, training_decoder_input], y=[training_decoder_output],
          validation_data=([validation_encoder_input, validation_decoder_input], [validation_decoder_output]),
          #validation_split=0.05,
          batch_size=128, epochs=100, callbacks=[callback, modelckpt_callback])

model.save('model_attention.h5')

Train on 72000 samples, validate on 18000 samples
Epoch 1/100

  updates = self.state_updates


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100


# Model Testing

In [39]:
def prediction(raw_input):
    clean_input = clean_text(raw_input)
    input_tok = [nltk.word_tokenize(clean_input)]
    input_tok = [input_tok[0][::-1]]  #reverseing input seq
    encoder_input = transform(encoding, input_tok, 20)
    decoder_input = np.zeros(shape=(len(encoder_input), OUTPUT_LENGTH))
    decoder_input[:,0] = WORD_CODE_START
    for i in range(1, OUTPUT_LENGTH):
        output = model.predict([encoder_input, decoder_input]).argmax(axis=2)
        decoder_input[:,i] = output[:,i]
    return output

def decode(decoding, vector):
    """
    :param decoding: decoding dict built by word encoding
    :param vector: an encoded vector
    """
    text = ''
    for i in vector:
        if i == 0:
            break
        text += ' '
        text += decoding[i]
    return text

# Examples

## Example 1 with random

In [40]:
for i in range(20):
    seq_index = np.random.randint(1, len(short_questions))
    output = prediction(short_questions[seq_index])
    print ('Q:', short_questions[seq_index])
    print ('A:', decode(decoding, output[0]))

  updates=self.state_updates,


Q: then it is god, right?
A:  i am not sure .
Q: rick, i really think i am in love.
A:  i am sorry .
Q: you sound like the media.
A:  i am not sure .
Q: do not judge me. you are a joke, coming here from a fuckedup culture, telling us what to do!
A:  i am sorry , i am not <UNK> !
Q: the new place? new.
A:  i do not know .
Q: yes. now.
A:  i am sorry .
Q: i do not see
A:  i am not going to be a <UNK> .
Q: it is the only name i have got. how about you?
A:  i do not know .
Q: in a dream.
A:  i am sorry .
Q: none at this time, sir.
A:  i am sorry .
Q: then tomorrow's grand jury indictments will just be bullshit.
A:  i am sorry , i am not <UNK> .
Q: what do you think, honey?
A:  i am not sure .
Q: it might be paranoia, but i have never lost my keycard before.
A:  i am sorry , i am not <UNK> .
Q: i read george jean nathan every week.
A:  i am sorry .
Q: sir, yes, sir!
A:  i am sorry , i am not <UNK> !
Q: i just cannot leave without first searching those islands, one by one.
A:  i am sorry , i

## Example 2 manual input

In [41]:
raw_input = input()
output = prediction(raw_input)
print (decode(decoding, output[0]))

 how are you?


 i am not sure .


## Example 3 with random 

In [45]:
for i in range(20):
    seq_index = np.random.randint(1, len(short_questions))
    output = prediction(short_questions[seq_index])
    print ('Q:', short_questions[seq_index])
    print ('A:', decode(decoding, output[0]))

Q: is he really not here?
A:  no rod never me date escort you man . i pretty no stuff is ! kinda .
Q: stop it. this is your pain your burning hand. it is right here. look at it.
A:  head is push , and i though you .
Q: you think that section on the point is ridable, lance?
A:  wanted damage find rather exactly know simply .
Q: that was fun. i do not think california is bad at all. it is a drag coming home.
A:  i listen reputation find cindy .
Q: yeah, sure. how much are they?
A:  wanted damage find disappeared goes push with following reputation that cuba .
Q: that is a great handle.
A:  i dry news .
Q: just had one.
A:  i listen reputation find issue .
Q: you want your surprise?
A:  and longer ... new you , say for doing listening land .
Q: she is quite pretty.
A:  i actually .
Q: would you? i am not so sure.
A:  we dating cry thirsty get results .
Q: hell, now's the time to buy it off him cheap.
A:  i listen reputation find cindy .
Q: you are the one that put us together. we are frie

## Example 4 with random

In [51]:
for i in range(20):
    seq_index = np.random.randint(1, len(short_questions))
    output = prediction(short_questions[seq_index])
    print ('Q:', short_questions[seq_index])
    print ('A:', decode(decoding, output[0]))

Q: okay, so...
A:  i am not going to <UNK> you .
Q: yeah, but what are you doing out here?
A:  i am not sure .
Q: in a manner of speaking. i have never encountered the olfactory hallucinations, but i am sure they are related.
A:  i am sorry , i am not <UNK> .
Q: let me see the sick.
A:  i am sorry .
Q: give him the works.
A:  i do not know .
Q: is anybody besides you here now?
A:  i am not sure .
Q: who is he?
A:  he is a <UNK> .
Q: wait a minute, sonny. i think you better come with me.
A:  i am not going to <UNK> you .
Q: do you know what ed gein said about women?
A:  i do not know .
Q: you sure?
A:  i am not sure .
Q: not much danger here, ma'am, i would not think right here in the heart of edinburgh.
A:  i am sorry , i am not <UNK> .
Q: how did you know he was a construction worker?
A:  he is a <UNK> .
Q: because i did only sleep with three guys! that does not mean i did not just go with people.
A:  i am not sure .
Q: your daughter's the only leverage they have to keep you quiet.
A: