# Imports

In [None]:
import numpy as np
from unicodedata import normalize
from pprint import pprint
import string
import re
import matplotlib.pyplot as plt
from keras.backend import clear_session
from keras.models import Sequential, Model
from keras.layers import Dense, LSTM, CuDNNLSTM, Input, Embedding, TimeDistributed, Flatten, Dropout, RepeatVector
from keras.callbacks import ModelCheckpoint
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from bokeh.models import ColumnDataSource, LabelSet
from bokeh.plotting import figure, show, output_file
from bokeh.io import output_notebook

# Reading movie lines

In [None]:
# Translation table that deletes every ASCII punctuation character.
table = str.maketrans('', '', string.punctuation)
# Matches any character that is not part of string.printable.
re_print = re.compile('[^%s]' % re.escape(string.printable))


def clean_sentence(line):
    """Normalise one raw utterance into lowercase, space-separated words.

    Steps, applied in order:
      1. NFD-normalise the text and drop every non-ASCII character.
      2. Split on whitespace and lowercase each token.
      3. Delete punctuation and non-printable characters from each token.
      4. Keep only tokens that are purely alphabetic, which also drops
         tokens containing digits and tokens that became empty.

    Punctuation is deleted rather than replaced by a space, so ``"a--b"``
    becomes ``"ab"`` while ``"a -- b"`` becomes ``"a b"``.

    Args:
        line: Raw sentence text.

    Returns:
        The cleaned words joined by single spaces ('' if nothing survives).
    """
    # Decompose accented characters, then discard whatever is not ASCII.
    ascii_line = normalize('NFD', line).encode('ascii', 'ignore').decode('ascii')

    words = []
    for token in ascii_line.split():
        word = re_print.sub('', token.lower().translate(table))
        # isalpha() is False for '' and for anything containing a non-letter.
        if word.isalpha():
            words.append(word)
    return ' '.join(words)

# Load the raw utterance file; every row is a '+++$+++'-separated record.
with open('./cornell-movie-dialogs-corpus/movie_lines.txt', 'r', errors='ignore') as f:
    lines_as_list = [row.strip() for row in f]

# Map the first field of each record (stripped) to the cleaned text of its last field.
lines = {}
for record in lines_as_list:
    fields = record.split('+++$+++')
    lines[fields[0].strip()] = clean_sentence(fields[-1])

# The raw rows are no longer needed once the lookup table exists.
del lines_as_list

with open('./cornell-movie-dialogs-corpus/movie_conversations.txt', 'r', errors='ignore') as f:
    conversations = [row.strip() for row in f]

# The last field of a conversation record is a list literal written as text.
# Delete the brackets, quotes and spaces, then split on commas to get the ids.
_list_syntax = str.maketrans('', '', "[]' ")
conversations = [
    conversation.split('+++$+++')[-1].strip().translate(_list_syntax).split(',')
    for conversation in conversations
]

# Preview the first ten entries of both structures.
preview_keys = list(lines)[:10]
pprint({key: lines[key] for key in preview_keys})
print()
pprint(conversations[:10])

# Every conversation must reference at least two lines.
assert not any(len(conversation) <= 1 for conversation in conversations)


# Map line IDs to their text

In [None]:
# Replace every line id of every conversation with the sentence stored in `lines`.
conversations_with_lines = [
    [lines[line_id] for line_id in conversation]
    for conversation in conversations
]

pprint(conversations_with_lines[100:110])

# Pair consecutive lines into (question, answer) pairs

In [None]:
def pair_it(my_list):
    """Return each element of ``my_list`` paired with its successor.

    Args:
        my_list: A sliceable sequence.

    Returns:
        A list of two-element lists ``[my_list[i], my_list[i + 1]]`` in
        order; empty when the sequence has fewer than two elements.
    """
    return [[current, following] for current, following in zip(my_list, my_list[1:])]

# One list of [question, answer] pairs per conversation.
paired_conversations_agg = [
    pair_it(conversation) for conversation in conversations_with_lines
]

# Flatten into a single (n_pairs, 2) array. dtype=object keeps references to
# the existing Python strings; without it NumPy builds a fixed-width unicode
# array in which every cell is as wide as the longest sentence.
conversations_pairs = np.array(
    [pair for conversation_pairs in paired_conversations_agg for pair in conversation_pairs],
    dtype=object,
)

# Preview up to ten pairs; slicing avoids an IndexError on small corpora.
for pair in conversations_pairs[:10]:
    pprint(pair)

# Noise reduction

In [None]:
output_notebook()

# Combined character length (question + answer) of every pair. Computed once
# and reused for the histogram, the longest-pair lookup and the filter.
pair_lengths = np.array(
    [len(question) + len(answer) for question, answer in conversations_pairs]
)

# Only `density=True` is passed: the legacy `normed` keyword duplicated it and
# is rejected by current NumPy releases.
hist, edges = np.histogram(pair_lengths, density=True, bins=100)

p = figure(tools="pan,wheel_zoom,reset,save",
           toolbar_location="above",
           title="Conversation length distribution")

p.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:], line_color="#555555")
show(p)

longest_converastion = conversations_pairs[pair_lengths.argmax()]
print("longest conversation: \n{}".format(longest_converastion))
print("\nlongest conversation lenght: {}".format(pair_lengths.max()))

max_conversation_lenght = 500  # maximum allowed conversation length in characters
# Keep only the pairs strictly shorter than the limit; the boolean mask selects whole rows.
clensed_conversations = conversations_pairs[pair_lengths < max_conversation_lenght]
print("filetered {} conversations".format(len(conversations_pairs) - len(clensed_conversations)))

# Character-level vectorization

In [None]:
def _encode_as_indices(texts, token_index, seq_length):
    """Encode each text as a zero-padded row of character ids.

    Returns a float32 array of shape (len(texts), seq_length); positions past
    the end of a text stay 0.
    """
    encoded = np.zeros((len(texts), seq_length), dtype='float32')
    for row, text in enumerate(texts):
        for position, char in enumerate(text):
            encoded[row, position] = token_index[char]
    return encoded


def _encode_as_one_hot(texts, token_index, seq_length, num_tokens):
    """Encode each text as a sequence of one-hot character vectors.

    Returns a float32 array of shape (len(texts), seq_length, num_tokens);
    positions past the end of a text stay all-zero.
    """
    encoded = np.zeros((len(texts), seq_length, num_tokens), dtype='float32')
    for row, text in enumerate(texts):
        for position, char in enumerate(text):
            encoded[row, position, token_index[char]] = 1.
    return encoded


# Collect the character vocabularies of the questions (inputs) and answers (targets).
input_characters = set()
target_characters = set()
for input_text, target_text in clensed_conversations:
    input_characters.update(input_text)
    target_characters.update(target_text)

input_characters = sorted(input_characters)
target_characters = sorted(target_characters)
# Input id 0 is reserved for padding because the model's Embedding layer is
# created with mask_zero=True. num_input_tokens therefore counts that extra
# slot, so it can be used directly as the Embedding input dimension.
num_input_tokens = len(input_characters) + 1
num_target_tokens = len(target_characters)
max_input_seq_length = max([len(input_text) for input_text, target_text in clensed_conversations])
max_target_seq_length = max([len(target_text) for input_text, target_text in clensed_conversations])
# Inputs and targets are padded to one common length.
max_sentence_lenght = max(max_input_seq_length, max_target_seq_length)

print('Number of unique input tokens:', len(input_characters))
print('Number of unique output tokens:', num_target_tokens)
print('Max sequence length for inputs:', max_input_seq_length)
print('Max sequence length for outputs:', max_target_seq_length)

# Input ids start at 1 (0 is padding); target ids start at 0 and index the one-hot axis.
input_token_index = {char: i for i, char in enumerate(input_characters, start=1)}
target_token_index = {char: i for i, char in enumerate(target_characters)}

# First 80% of the pairs for training, the rest for validation (no shuffling).
split_index = int(len(clensed_conversations) * .8)
pre_trainX = clensed_conversations[:split_index, 0]
pre_trainY = clensed_conversations[:split_index, 1]
pre_testX = clensed_conversations[split_index:, 0]
pre_testY = clensed_conversations[split_index:, 1]

# Inputs use the input vocabulary; targets are one-hot over the target vocabulary.
trainX = _encode_as_indices(pre_trainX, input_token_index, max_sentence_lenght)
trainY = _encode_as_one_hot(pre_trainY, target_token_index, max_sentence_lenght, num_target_tokens)
testX = _encode_as_indices(pre_testX, input_token_index, max_sentence_lenght)
testY = _encode_as_one_hot(pre_testY, target_token_index, max_sentence_lenght, num_target_tokens)

print("trainX {} | trainY {} | testX {} | testY {}".format(trainX.shape, trainY.shape, testX.shape, testY.shape))

# Define Model

In [None]:
# Start from a clean Keras session so re-running this cell does not stack models.
clear_session()
n_units = 256

# Encoder-decoder stack: the first LSTM condenses the embedded input into one
# vector, RepeatVector copies it to every output step, and the second LSTM
# plus a per-step softmax emit a distribution over target characters.
# NOTE(review): mask_zero=True makes the Embedding treat input id 0 as
# padding, so the input encoding must not assign 0 to a real character.
# Both LSTM layers could be swapped for CuDNNLSTM.
model = Sequential([
    Embedding(num_input_tokens, n_units, input_length=max_sentence_lenght, mask_zero=True),
    LSTM(n_units),
    RepeatVector(max_sentence_lenght),
    LSTM(n_units, return_sequences=True),
    TimeDistributed(Dense(num_target_tokens, activation='softmax')),
])

model.compile(loss='categorical_crossentropy', optimizer='adam')
# Print the layer-by-layer summary of the model just defined.
model.summary()

# Training

In [None]:
# Train the model, writing a checkpoint to `filename` whenever the
# validation loss reaches a new minimum.
filename = 'mount-this/model.h5'
checkpoint = ModelCheckpoint(
    filename,
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)
model.fit(
    trainX,
    trainY,
    batch_size=64,
    epochs=30,
    validation_data=(testX, testY),
    callbacks=[checkpoint],
    verbose=2,
)