# Imports

In [1]:
from unicodedata import normalize
from pprint import pprint
import string
import re
from keras.models import Sequential, Model
from keras.layers import Dense, LSTM, CuDNNLSTM, Input, Embedding, TimeDistributed, Flatten, Dropout
from keras.callbacks import ModelCheckpoint
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
import numpy as np

Using TensorFlow backend.


# Reading movie lines

In [2]:
table = str.maketrans('', '', string.punctuation)
# prepare regex for char filtering
re_print = re.compile('[^%s]' % re.escape(string.printable))
# prepare translation table for removing punctuation
table = str.maketrans('', '', string.punctuation)

def clean_sentence(line):
    line = line.strip().replace('--', '').replace("  ", " ").replace('"', "")
    line = normalize('NFD', line).encode('ascii', 'ignore')
    line = line.decode('UTF-8')
    # tokenize on white space
    line = line.split()
    # convert to lowercase
    line = [word.lower() for word in line]
    # remove punctuation from each token
    line = [word.translate(table) for word in line]
    # remove non-printable chars form each token
    line = [re_print.sub('', w) for w in line]
    # remove tokens with numbers in them
    line = [word for word in line if word.isalpha()]
    return ' '.join(line)

with open('./cornell-movie-dialogs-corpus/movie_lines.txt', 'r', errors='ignore') as f:
    lines_as_list = [row.strip() for row in f.readlines()]


lines = {}
for line in lines_as_list:
    lines[
        line.split('+++$+++')[0].strip()
    ] = clean_sentence(line.split('+++$+++')[-1])  # clean sentences

del lines_as_list

with open('./cornell-movie-dialogs-corpus/movie_conversations.txt', 'r', errors='ignore') as f:
    conversations = [row.strip() for row in f.readlines()]

# only take id's and convert list as string to list as list
conversations = [
    conversation.split('+++$+++')[-1].strip().replace('[', '').replace(']', '').replace("'", '').replace(" ", '').split(',') 
    for conversation in conversations
]

pprint({k: lines[k] for k in list(lines)[:10]})
print()
pprint(conversations[:10])

assert len([conversation for conversation in conversations if len(conversation) <=1]) == 0


{'L1044': 'they do to',
 'L1045': 'they do not',
 'L869': 'like my fear of wearing pastels',
 'L870': 'im kidding you know how sometimes you just become this persona and '
         'you dont know how to quit',
 'L871': 'no',
 'L872': 'okay youre gonna need to learn how to lie',
 'L924': 'wow',
 'L925': 'lets go',
 'L984': 'she okay',
 'L985': 'i hope so'}

[['L194', 'L195', 'L196', 'L197'],
 ['L198', 'L199'],
 ['L200', 'L201', 'L202', 'L203'],
 ['L204', 'L205', 'L206'],
 ['L207', 'L208'],
 ['L271', 'L272', 'L273', 'L274', 'L275'],
 ['L276', 'L277'],
 ['L280', 'L281'],
 ['L363', 'L364'],
 ['L365', 'L366']]


# map keys to line

In [3]:
conversations_with_lines = []
for conversation in conversations:
    conversations_with_lines.append([lines[key] for key in conversation])
    
pprint(conversations_with_lines[100:110])

[['yeah', 'what do you think'],
 ['two legs nice rack',
  'yeah whatever i want you to go out with her',
  'sure sparky ill get right on it',
  'you just said',
  'you need money to take a girl out',
  'but youd go out with her if you had the cake'],
 ['you got it verona i pick up the tab you do the honors',
  'youre gonna pay me to take out some girl',
  'i cant date her sister until that one gets a boyfriend and thats the catch '
  'she doesnt want a boyfriend',
  'how much'],
 ['i cant take a girl like that out on twenty bucks', 'fine thirty'],
 ['take it or leave it this isnt a negotiation',
  'fifty and youve got your man'],
 ['when i shell out fifty i expect results',
  'im on it',
  'watching the bitch trash my car doesnt count as a date',
  'i got her under control she just acts crazed in public to keep up the '
  'image'],
 ['i just upped my price',
  'what',
  'a hundred bucks a date',
  'forget it',
  'forget her sister then'],
 ['its about time', 'a deals a deal'],
 ['howd 

# Pair those things

In [4]:
def pair_it(my_list):
    pairs = []
    for i in range(len(my_list) -1):
        pairs.append([my_list[i], my_list[i + 1]])
    return pairs

paired_conversations_agg = [
    pair_it(conversation) for conversation in conversations_with_lines
]
conversations_pairs = np.array([item for sublist in paired_conversations_agg for item in sublist])
for i in range(10):
    pprint(conversations_pairs[i])

array(['can we make this quick roxanne korrine and andrew barrett are having an incredibly horrendous public break up on the quad again',
       'well i thought wed start with pronunciation if thats okay with you'],
      dtype='<U2857')
array(['well i thought wed start with pronunciation if thats okay with you',
       'not the hacking and gagging and spitting part please'],
      dtype='<U2857')
array(['not the hacking and gagging and spitting part please',
       'okay then how bout we try out some french cuisine saturday night'],
      dtype='<U2857')
array(['youre asking me out thats so cute whats your name again',
       'forget it'], dtype='<U2857')
array(['no no its my fault we didnt have a proper introduction',
       'cameron'], dtype='<U2857')
array(['cameron',
       'the thing is cameron im at the mercy of a particularly hideous breed of loser my sister i cant date until she does'],
      dtype='<U2857')
array(['the thing is cameron im at the mercy of a particularly hideou

# Tokenize

In [5]:
# fit a tokenizer
def create_tokenizer(lines):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer

# max sentence length
def max_length(lines):
    return max(len(line.split()) for line in lines)

# prepare left tokenizer
left_tokenizer = create_tokenizer(conversations_pairs[:, 0])
left_vocab_size = len(left_tokenizer.word_index) + 1
left_length = max_length(conversations_pairs[:, 0])

print('left Vocabulary Size: %d' % left_vocab_size)
print('left Max Length: %d' % (left_length))
# prepare german tokenizer
right_tokenizer = create_tokenizer(conversations_pairs[:, 1])
right_vocab_size = len(right_tokenizer.word_index) + 1
right_length = max_length(conversations_pairs[:, 1])
print('right Vocabulary Size: %d' % right_vocab_size)
print('right Max Length: %d' % (right_length))



left Vocabulary Size: 53802
left Max Length: 313
right Vocabulary Size: 54617
right Max Length: 552


# Split

In [6]:
print("split index {}".format(len(conversations_pairs)))
split_index = int(len(conversations_pairs) * .8)
train, test = conversations_pairs[:split_index], conversations_pairs[split_index:]

split index 221616


# Encode

In [None]:
# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
    # integer encode sequences
    X = tokenizer.texts_to_sequences(lines)
    # pad sequences with 0 values
    X = pad_sequences(X, maxlen=length, padding='post')
    return X

# one hot encode target sequence
def encode_output(sequences, vocab_size):
    ylist = list()
    for sequence in sequences:
        encoded = to_categorical(sequence, num_classes=vocab_size)
        ylist.append(encoded)
    y = array(ylist)
    y = y.reshape(sequences.shape[0], sequences.shape[1], vocab_size)
    return y

# prepare training data
trainX = encode_sequences(right_tokenizer, right_length, train[:, 1])
trainY = encode_sequences(left_tokenizer, left_length, train[:, 0])
trainY = encode_output(trainY, left_vocab_size)
# prepare validation data
testX = encode_sequences(right_tokenizer, right_length, test[:, 1])
testY = encode_sequences(left_tokenizer, left_length, test[:, 0])
testY = encode_output(testY, left_vocab_size)

# Define Model

In [None]:
n_units=256
model = Sequential()
model.add(Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True))
model.add(LSTM(n_units))
model.add(RepeatVector(tar_timesteps))
model.add(LSTM(n_units, return_sequences=True))
model.add(TimeDistributed(Dense(tar_vocab, activation='softmax')))

model = define_model(ger_vocab_size, eng_vocab_size, ger_length, eng_length, 256)
model.compile(optimizer='adam', loss='categorical_crossentropy')
# summarize defined model
print(model.summary())

# Training

In [None]:
# fit model
filename = 'model.h5'
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
model.fit(trainX, trainY, epochs=30, batch_size=64, validation_data=(testX, testY), callbacks=[checkpoint], verbose=2)