In [1]:
# Start by importing all the things we'll need.
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Model
from tensorflow.keras.utils import Sequence
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding, CuDNNLSTM, Flatten, TimeDistributed, Dropout, LSTMCell, RNN
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.python.keras.utils import tf_utils
from tensorflow.keras import backend as K

# This enables the Jupyter backend on some matplotlib installations.
%matplotlib notebook
import matplotlib.pyplot as plt
# Turn off interactive plots. iplt doesn't work well with Jupyter.
plt.ioff()

import unicodedata
import pymongo
import re
import numpy as np
import os
import time
import shutil
import math
import threading

In [128]:
BATCH_SIZE = 64
embedding_dim = 256
units = 512
len_input = 300
len_output = 30
vocab_in_size = 2475
vocab_out_size = 2475

In [3]:
# RNN "Cell" classes in Keras perform the actual data transformations at each timestep. Therefore, in order
# to add attention to LSTM, we need to make a custom subclass of LSTMCell.
class AttentionLSTMCell(LSTMCell):
    def __init__(self, **kwargs):
        self.attentionMode = False
        super(AttentionLSTMCell, self).__init__(**kwargs)
    
    # Build is called to initialize the variables that our cell will use. We will let other Keras
    # classes (e.g. "Dense") actually initialize these variables.
    @tf_utils.shape_type_conversion
    def build(self, input_shape):        
        # Converts the input sequence into a sequence which can be matched up to the internal
        # hidden state.
        self.dense_constant = TimeDistributed(Dense(self.units, name="AttLstmInternal_DenseConstant"))
        
        # Transforms the internal hidden state into something that can be used by the attention
        # mechanism.
        self.dense_state = Dense(self.units, name="AttLstmInternal_DenseState")
        
        # Transforms the combined hidden state and converted input sequence into a vector of
        # probabilities for attention.
        self.dense_transform = Dense(1, name="AttLstmInternal_DenseTransform")
        
        # We will augment the input into LSTMCell by concatenating the context vector. Modify
        # input_shape to reflect this.
        batch, input_dim = input_shape[0]
        batch, timesteps, context_size = input_shape[-1]
        lstm_input = (batch, input_dim + context_size)
        
        # The LSTMCell superclass expects no constant input, so strip that out.
        return super(AttentionLSTMCell, self).build(lstm_input)
    
    # This must be called before call(). The "input sequence" is the output from the 
    # encoder. This function will do some pre-processing on that sequence which will
    # then be used in subsequent calls.
    def setInputSequence(self, input_seq):
        self.input_seq = input_seq
        self.input_seq_shaped = self.dense_constant(input_seq)
        self.timesteps = tf.shape(self.input_seq)[-2]
    
    # This is a utility method to adjust the output of this cell. When attention mode is
    # turned on, the cell outputs attention probability vectors across the input sequence.
    def setAttentionMode(self, mode_on=False):
        self.attentionMode = mode_on
    
    # This method sets up the computational graph for the cell. It implements the actual logic
    # that the model follows.
    def call(self, inputs, states, constants):
        # Separate the state list into the two discrete state vectors.
        # ytm is the "memory state", stm is the "carry state".
        ytm, stm = states
        # We will use the "carry state" to guide the attention mechanism. Repeat it across all
        # input timesteps to perform some calculations on it.
        stm_repeated = K.repeat(self.dense_state(stm), self.timesteps)
        # Now apply our "dense_transform" operation on the sum of our transformed "carry state" 
        # and all encoder states. This will squash the resultant sum down to a vector of size
        # [batch,timesteps,1]
        # Note: Most sources I encounter use tanh for the activation here. I have found with this dataset
        # and this model, relu seems to perform better. It makes the attention mechanism far more crisp
        # and produces better translation performance, especially with respect to proper sentence termination.
        combined_stm_input = self.dense_transform(
            keras.activations.relu(stm_repeated + self.input_seq_shaped))
        # Performing a softmax generates a log probability for each encoder output to receive attention.
        score_vector = keras.activations.softmax(combined_stm_input, 1)
        # In this implementation, we grant "partial attention" to each encoder output based on 
        # it's log probability accumulated above. Other options would be to only give attention
        # to the highest probability encoder output or some similar set.
        context_vector = K.sum(score_vector * self.input_seq, 1)
        
        # Finally, mutate the input vector. It will now contain the traditional inputs (like the seq2seq
        # we trained above) in addition to the attention context vector we calculated earlier in this method.
        inputs = K.concatenate([inputs, context_vector])
        
        # Call into the super-class to invoke the LSTM math.
        res = super(AttentionLSTMCell, self).call(inputs=inputs, states=states)
        
        # This if statement switches the return value of this method if "attentionMode" is turned on.
        if(self.attentionMode):
            return (K.reshape(score_vector, (-1, self.timesteps)), res[1])
        else:
            return res

# Custom implementation of the Keras LSTM that adds an attention mechanism.
# This is implemented by taking an additional input (using the "constants" of the
# RNN class) into the LSTM: The encoder output vectors across the entire input sequence.
class LSTMWithAttention(RNN):
    def __init__(self, units, **kwargs):
        cell = AttentionLSTMCell(units=units)
        self.units = units
        super(LSTMWithAttention, self).__init__(cell, **kwargs)
        
    @tf_utils.shape_type_conversion
    def build(self, input_shape):
        self.input_dim = input_shape[0][-1]
        self.timesteps = input_shape[0][-2]
        return super(LSTMWithAttention, self).build(input_shape) 
    
    # This call is invoked with the entire time sequence. The RNN sub-class is responsible
    # for breaking this up into calls into the cell for each step.
    # The "constants" variable is the key to our implementation. It was specifically added
    # to Keras to accomodate the "attention" mechanism we are implementing.
    def call(self, x, constants, **kwargs):
        if isinstance(x, list):
            self.x_initial = x[0]
        else:
            self.x_initial = x
        
        # The only difference in the LSTM computational graph really comes from the custom
        # LSTM Cell that we utilize.
        self.cell._dropout_mask = None
        self.cell._recurrent_dropout_mask = None
        self.cell.setInputSequence(constants[0])
        return super(LSTMWithAttention, self).call(inputs=x, constants=constants, **kwargs)

# Below is test code to validate that this LSTM class and the associated cell create a
# valid computational graph.
test = LSTMWithAttention(units=units, return_sequences=True, return_state=True)
test.cell.setAttentionMode(True)
attenc_inputs2 = Input(shape=(len_input,))
attenc_emb2 = Embedding(input_dim=vocab_in_size, output_dim=embedding_dim)
test(inputs=attenc_emb2(attenc_inputs2), constants=attenc_emb2(attenc_inputs2), initial_state=None)

[<tf.Tensor 'lstm_with_attention/transpose_1:0' shape=(?, 300, ?) dtype=float32>,
 <tf.Tensor 'lstm_with_attention/while/Exit_3:0' shape=(?, 512) dtype=float32>,
 <tf.Tensor 'lstm_with_attention/while/Exit_4:0' shape=(?, 512) dtype=float32>]

In [4]:
# Re-create an entirely new model and set of layers for the attention model

# Encoder Layers
attenc_inputs = Input(shape=(len_input,), name="attenc_inputs")
attenc_emb = Embedding(input_dim=vocab_in_size, output_dim=embedding_dim)
attenc_lstm = CuDNNLSTM(units=units, return_sequences=True, return_state=True)
attenc_outputs, attstate_h, attstate_c = attenc_lstm(attenc_emb(attenc_inputs))
attenc_states = [attstate_h, attstate_c]

attdec_inputs = Input(shape=(None,))
attdec_emb = Embedding(input_dim=vocab_out_size, output_dim=embedding_dim)
attdec_lstm = LSTMWithAttention(units=units, return_sequences=True, return_state=True)
# Note that the only real difference here is that we are feeding attenc_outputs to the decoder now.
# Nice and clean!
attdec_lstm_out, _, _ = attdec_lstm(inputs=attdec_emb(attdec_inputs), 
                                    constants=attenc_outputs, 
                                    initial_state=attenc_states)
attdec_d1 = Dense(units, activation="relu")
attdec_d2 = Dense(vocab_out_size, activation="softmax")
attdec_out = attdec_d2(Dropout(rate=.4)(attdec_d1(Dropout(rate=.4)(attdec_lstm_out))))

attmodel = Model([attenc_inputs, attdec_inputs], attdec_out)
attmodel.compile(optimizer=tf.keras.optimizers.Adam(), loss="sparse_categorical_crossentropy", metrics=['sparse_categorical_accuracy'])

In [162]:
class TLDRSequence(Sequence):

    def __init__(self, batch_size):
        self.batch_size = batch_size
        client = pymongo.MongoClient()
        self.tldr = client['test']['tldr_clean']
        self.thread = threading.Thread(target=self.prepare_data)
        self.thread.start()
        
    def prepare_data(self):
        cursor = self.tldr.aggregate([{'$sample':{'size': self.batch_size}}])
        self.input_mtx = np.zeros((self.batch_size, len_input))
        self.input_mtx[:, 0] = 2
        self.teacher_mtx = np.zeros((self.batch_size, len_output))
        self.teacher_mtx[:, 0] = 2
        self.output_mtx = np.zeros((self.batch_size, len_output))
        
        for idx, data in enumerate(cursor):
            vo = data['summary_vec_must_word']
            lo = len(vo)
            vi = data['content_vec']
            li = data['content_len']
            
            self.input_mtx[idx, 1: 1+li] = vi
            self.input_mtx[idx, 1+li] = 3
            self.teacher_mtx[idx, 1: 1+lo] = vo
            self.teacher_mtx[idx, 1+lo] = 3
            self.output_mtx[idx, 0:lo] = vo
            self.output_mtx[idx, lo] = 3
        

    def __len__(self):
        return 500

    def __getitem__(self, idx):        
        self.thread.join()
        input_mtx = self.input_mtx.copy()
        teacher_mtx = self.teacher_mtx.copy()
        output_mtx = self.output_mtx.copy()
        
        self.thread = threading.Thread(target=self.prepare_data)    
        self.thread.start()
        return [input_mtx, teacher_mtx], output_mtx[:,:,None]
    
Gen = TLDRSequence(BATCH_SIZE)

In [None]:
epochs = 200
atthist = attmodel.fit_generator(Gen, epochs=epochs, initial_epoch=55)

# Plot the results of the training.
plt.plot(atthist.history['sparse_categorical_accuracy'], label="Training loss")
plt.plot(atthist.history['val_sparse_categorical_accuracy'], label="Validation loss")
plt.show()

Epoch 56/200
 40/500 [=>............................] - ETA: 3:09 - loss: 2.1585 - sparse_categorical_accuracy: 0.6313

In [8]:
attmodel.save_weights("attention_trained_weights.h5")


## Inference With Attention
Now that we've got an attention model, let's test it, similar to above. The inference models don't change much from our seq2seq implementation, again with the exception of feeding in the encoder outputs to the decoder (which we already did in train() above).

One thing I noticed about this attention model is that it actually seems to perform worse on completely novel data (e.g. phrases that aren't at all related to the ones in the training data).

In [10]:
class LanguageIndex():
    def __init__(self):
        self.word2idx = {}
        self.idx2word = {}
        self.vocab = {}
        
    def create_index(self):
        self.word2idx = {}
        self.idx2word = {}
        self.vocab = sorted(
            w
            for w, num in self.vocab.items()
            if num > 18500
        )
        self.word2idx["<pad>"] = 0
        self.idx2word[0] = "<pad>"
        self.word2idx["<noword>"] = 1
        self.idx2word[1] = "<noword>"
        self.word2idx["<start>"] = 2
        self.idx2word[2] = "<start>"
        self.word2idx["<end>"] = 3
        self.idx2word[3] = "<end>"
        
        for i,word in enumerate(self.vocab):
            self.word2idx[word] = i + 4
            self.idx2word[i+4] = word
            
    def update_vocab(self, text):
        self.vocab.update(text.split())
        
LangIdx = LanguageIndex()

import json
with open('vocab.txt', 'r') as f:
    LangIdx.vocab = json.load(f)

LangIdx.create_index()

In [154]:
def sentence_to_vector(sentence, lang):
    pre = "<start> " + sentence.lower() + " <end>"
    vec = np.zeros(len_input)
    sentence_list = [lang.word2idx.get(s, 1) for s in pre.split(' ')]
    for i,w in enumerate(sentence_list):
        vec[i] = w
    return vec

# Given an input string, an encoder model (infenc_model) and a decoder model (infmodel),
# return a translated string.
def translate(input_sentence, infenc_model, infmodel, attention=True):
    sv = sentence_to_vector(input_sentence, LangIdx)
    # Reshape so we can use the encoder model. New shape=[samples,sequence length]
    sv = sv.reshape(1,len(sv))
    [emb_out, sh, sc] = infenc_model.predict(x=sv)
    
    i = 0
    start_vec = LangIdx.word2idx["<start>"]
    stop_vec = LangIdx.word2idx["<end>"]
    # We will continuously feed cur_vec as an input into the decoder to produce the next word,
    # which will be assigned to cur_vec. Start it with "<start>".
    cur_vec = np.zeros((1,1))
    cur_vec[0,0] = start_vec
    cur_word = "<start>"
    output_sentence = ""
    # Start doing the feeding. Terminate when the model predicts an "<end>" or we reach the end
    # of the max target language sentence length.
    while cur_word != "<end>" and i < (len_output-1):
        i += 1
        if cur_word != "<start>":
            output_sentence = output_sentence + " " + cur_word
        x_in = [cur_vec, sh, sc]
        # This will allow us to accomodate attention models, which we will talk about later.
        if attention:
            x_in += [emb_out]
        [nvec, sh, sc] = infmodel.predict(x=x_in)
        # The output of the model is a massive softmax vector with one spot for every possible word. Convert
        # it to a word ID using argmax().
        supresswords = [*map(LangIdx.word2idx.get, 
                             ['<noword>', 'i', 'you', 'good', 'dick', 'is', 'was', 'has', 's']
                            )]
        nvec[0,0,supresswords] = 0
        cur_vec[0,0] = np.argmax(nvec[0,0])
        cur_word = LangIdx.idx2word[np.argmax(nvec[0,0])]
    return output_sentence

In [73]:
def createAttentionInference(attention_mode=False):
    # Create an inference model using the layers already trained above.
    attencoder_model = Model(attenc_inputs, [attenc_outputs, attstate_h, attstate_c])
    state_input_h = Input(shape=(units,), name="state_input_h")
    state_input_c = Input(shape=(units,), name="state_input_c")
    attenc_seq_out = Input(shape=attenc_outputs.get_shape()[1:], name="attenc_seq_out")
    inf_attdec_inputs = Input(shape=(None,), name="inf_attdec_inputs")
    attdec_lstm.cell.setAttentionMode(attention_mode)
    attdec_res, attdec_h, attdec_c = attdec_lstm(attdec_emb(inf_attdec_inputs), 
                                                 initial_state=[state_input_h, state_input_c], 
                                                 constants=attenc_seq_out)
    attinf_model = None
    if not attention_mode:
        inf_attdec_out = attdec_d2(attdec_d1(attdec_res))
        attinf_model = Model(inputs=[inf_attdec_inputs, state_input_h, state_input_c, attenc_seq_out], 
                             outputs=[inf_attdec_out, attdec_h, attdec_c])
    else:
        attinf_model = Model(inputs=[inf_attdec_inputs, state_input_h, state_input_c, attenc_seq_out], 
                             outputs=[attdec_res, attdec_h, attdec_c])
    return attencoder_model, attinf_model

attencoder_model, attinf_model = createAttentionInference()




print(translate("i really need to put together a respect thread here s a woefully inaccurate summary kefka palazzo is the final boss of final fantasy in the first half of the game he is the court mage under the evil conqueror emperor gestahl he becomes gestahl s top general leading troops to conquer nation after nation with crazy powerful magic science magitek when conquering nashe he grabs one of your crew who s powered by magitek and uses her as a puppet to genocide the town when conquering doma he gets sick of waiting and poisons their water supply eradicating everyone then he lols and heads home this actually pisses off his boss gestahl who imprisons him kefka lols his way out of prison to conquer thamasa where he murders their spirit gods espers he uses the power he gains to make a giant floating island and give it to gestahl they re buddies again your crew shows up gestahl nopes you all into ice kefka uses this opportunity to backstab gestahl he lols and tosses his boss and all your crew off his floating island some of you are caught by your team s airship some get dead like storyline death the really real actual no coming back total splat factor dead kefka lols and reks the fucking world a year passes your crew is all shitfucked and scattered across the planet kefka is lord of the god kings he pretty much just sits on his skyscraper tower lolling gloating and slaughtering people by the millions for not worshipping him", attencoder_model, attinf_model, True))


 i m a dick


In [140]:
print(translate(
    "not op and only a noob accountant but i helped a fledgling hostel with the financial section of it s business plan these guys were friends brothers who just simply pooled their money together and jumped into the hostel business because they saw an opportunity for profit if they ll ever reach profit is debatable number one expense was the labor used to construct the bathroom shower area for the guests if you take one thing away from this it s don t do what they did hire illegal unlicensed labor to do plumbing these guys paid out the ass for the initial labor and now have to pay even more because the job was done badly and water was getting under the tiles and rotting away the wood underneath as for bedding they just made a bulk order from ikea for all the bunk beds sheets washing was their other large expense they organized a contract with a laundromat to take advantage of bulk discounts most important thing location admittedly these guys had a bad location and that fact will show up in the reviews from customers they use websites such as hostelworld to fill their rooms another expense that shouldn t be skimped out on is decoration depending on the type of customer you wish to attract you need to decorate accordingly whether it s artsy hand painted illustrations cool stencils or just painted wall you need to cater to a crowd travelers who want to party shouldn t be mixed together with people who just like to hangout and site see that s my two cents at least take it with a grain of salt i ve never ran a hostel myself"
    , attencoder_model, attinf_model, True))


 you re not a good idea


In [141]:
print(translate(
"i typically play by getting rid of all of but my essentials and then trek around the wilderness for hours till i m overencumbered then drop the least valuable thing and start the trek back i only fast travel from settlements i ve built a zeppelin tower from and then it s still a long trek back that last part is more how i wished fast travel worked so i set my own limits and it does wonders for immersion also helps me discover more locations since i don t fast travel everywhere but on those long treks i could be out for hours and not have an auto save more my own fault than anything but i ve gotten stuck in those perpetually falling wedges of rock on said treks and had nobody but myself to blame for getting stuck"
    , attencoder_model, attinf_model, True))


 it s a good idea


In [158]:
diffnum = []
for i in range(1000):
    example = Gen.tldr.aggregate([{'$sample':{'size': 1}}]).next()
    gr_len = example['summary_len']
    pr_len = len(translate(example['content'], attencoder_model, attinf_model, True).split())
    diffnum.append(gr_len - pr_len)
    

In [160]:
np.histogram(diffnum, bins=[0,1,5,10,50,100])

(array([ 27, 138, 205, 484,   0]), array([  0,   1,   5,  10,  50, 100]))

1000

In [156]:
print(translate(
    "I finished Path of Daggers earlier this month and took a break to read I Am Pilgrim , I HIGHLY recommend , and I am now ready to start my journey into book 9 . One thing , I've forgotten a few plot threads . I can't search for them as possible spoilers , so what do I need to know going in ? Oh , and SPOILERS for those not up to here and please , no spoilers for me . I love this series to much for it to be ruined . I know Faile has been taken but I'm not sure about the other main characters where abouts and smaller character plots ."
        , attencoder_model, attinf_model, True))

 what do we do


One other neat thing we can do with attention is investigate what the model is paying attention to in the encoder inputs when it is performing translations. We added this functionality to our LSTMAttentionCell class already, we just need to turn it on. 

Note that when this feature is turned on, the decoder no longer outputs word IDs. This means we'll need to revert back to using our teacher data to guide the decoder through an output phrase while we track where the model is paying attention.

The below cell generates a table that is colored according to attention. Hotter, yellow colors correspond to higher attention, while darker blues correspond to less attention.

If you bother to train the full data set, play around with this a bit. I feel compelled to point out how remarkable this learned behavior is. If you use this same function with untrained weights, the matrix below is randomly distributed with great uniformity. The training process creates all of the variety that you see, and simply because we constrained our model in a particular way. This emergence of meaning and overall semantic understanding from raw data is what makes machine learning so cool to me. It truly is a thing of beauty.

In [None]:
def investigate_attention(input_sentence, output_sentence, infenc_model, infmodel):
    sv = sentence_to_vector(input_sentence, input_lang)
    # Shape=samples,sequence length
    sv = sv.reshape(1,len(sv))
    [emb_out, sh, sc] = infenc_model.predict(x=sv)
    
    outvec = sentence_to_vector(output_sentence, target_lang)
    i = 0
    cur_vec = np.zeros((1,1))
    cur_vec[0,0] = outvec[0]
    cur_word = "<start>"
    output_attention = []
    while i < (len(outvec)-1):
        i += 1
        x_in = [cur_vec, sh, sc, emb_out]
        [nvec, sh, sc] = infmodel.predict(x=x_in)
        output_attention += [nvec]
        cur_vec[0,0] = outvec[i]
    return output_attention

def plotAttention(attMatrix):
    attMatrix = np.asarray(attMatrix)
    attMatrix = np.reshape(attMatrix, (attMatrix.shape[0], attMatrix.shape[-1]))
    #print(attMatrix)
    fig = plt.figure(figsize=(5,5))
    ax = fig.add_subplot(1, 1, 1)
    ax.matshow(attMatrix, aspect="auto")

    plt.show()

attencoder_model, attinf_model = createAttentionInference(True)
#print(investigate_attention("I love me", attencoder_model, attinf_model, True))
#print(investigate_attention("I am hungry", attencoder_model, attinf_model, True))
plotAttention(investigate_attention("You can use a dictionary for this exam.", "Para este examen podéis usar un diccionario.", attencoder_model, attinf_model))