In [1]:
import gc
import os
import sys

import numpy as np
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard
from tensorflow.keras.layers import Embedding, LSTM, Dense, Input, GRU
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

from load_data import get_lookup_tables, filter_len, word2int, find_longest_sequence, limits, start_token, end_token

# Print NumPy arrays in full instead of eliding the middle with "...".
np.set_printoptions(threshold=sys.maxsize)

# Let TensorFlow grow GPU memory on demand instead of reserving it all up front.
# Iterating over the device list (rather than indexing [0]) keeps the notebook
# runnable on CPU-only machines, where the list is empty, and applies the same
# setting to every visible GPU.
physical_devices = tf.config.list_physical_devices('GPU')
for gpu_device in physical_devices:
    tf.config.experimental.set_memory_growth(gpu_device, enable=True)

## Check tf version

In [2]:
# Display the installed TensorFlow version so the run can be reproduced.
tf.__version__

'2.1.0'

## Model Parameters

In [3]:
# Hyperparameters shared by the model-building and training cells below.
embedding_size = 120  # output_dim of the encoder/decoder Embedding layers
lstm_state_size = 300  # units per recurrent layer (the layers below are GRUs, despite the name)
vocab_size = limits['vocab_size']  # vocabulary cap imported from load_data
batch_size = 64
validation_split = 0.15  # passed to model.fit() as validation_split
epochs = 10

## Import Data

In [4]:
# filter_len() is imported from load_data and yields the (text, summary) pair
# used throughout the notebook — presumably length-filtered, judging by the
# name; TODO confirm in load_data.
text, summary = filter_len()
# NOTE: the pre-built lookup tables below are disabled, so the names
# word2int_lookup / int2word_lookup are never defined in this notebook.
#word2int_lookup, int2word_lookup = get_lookup_tables()

## Tokenizing

In [5]:
class Tokenize(Tokenizer):
    """Keras ``Tokenizer`` that is fitted at construction time and adds lookup helpers.

    Attributes set by ``__init__``:
        word2int: the fitted ``word_index`` mapping (word -> integer id).
        int2word: the inverse mapping (integer id -> word).
    """

    def __init__(self, text, vocab_size):
        """Fit the tokenizer on ``text`` with ``num_words`` set to ``vocab_size``."""
        Tokenizer.__init__(self, num_words=vocab_size)
        self.fit_on_texts(text)
        # word2int is the very same dict object as word_index, not a copy.
        self.word2int = self.word_index
        self.int2word = dict((index, word) for word, index in self.word_index.items())

    def int2word_lookup(self, int_list):
        """Decode a sequence of ids into a space-separated string, skipping id 0."""
        decoded = []
        for token_id in int_list:
            if token_id == 0:
                # 0 is the padding value and has no entry in int2word.
                continue
            decoded.append(self.int2word[token_id])
        return " ".join(decoded)

    def word2int_lookup(self, word_list):
        """Encode the given texts as integer sequences via ``texts_to_sequences``."""
        return self.texts_to_sequences(word_list)

In [None]:
# Fit one tokenizer per corpus, so texts and summaries get separate vocabularies
# (both capped at limits['vocab_size']).
text_tokenizer = Tokenize(text, limits['vocab_size'])
summary_tokenizer = Tokenize(summary, limits['vocab_size'])

In [None]:
# Peek at the first summary while it is still untokenized.
summary[0]

In [None]:
# Replace the raw strings with their integer sequences.
# NOTE: this rebinds `text` and `summary`, so the cell is not idempotent —
# re-running it would feed integer lists back into texts_to_sequences.
text = text_tokenizer.word2int_lookup(text)
summary = summary_tokenizer.word2int_lookup(summary)

In [None]:
#summary[0]

### For Debugging Tokenizer Dictionary

In [None]:
# Debug aid: show the full word -> id dictionary of the summary tokenizer.
summary_tokenizer.word2int

In [None]:
# Debug aid: show the full word -> id dictionary of the text tokenizer.
text_tokenizer.word2int

## Pad the Data

Zero pad text and summary. 

Padding the text: compute the number of words in every text and use the mean plus 2 standard deviations as the maximum length. This length should cover roughly 95% of all texts; shorter texts are zero-padded and longer texts are truncated.

Padding the summary: Find the longest summary, pad all summaries to that length

In [None]:
# Token count of every tokenized text. Stored as an array so the coverage
# check below is an explicit element-wise comparison rather than relying on
# NumPy's reflected comparison between a Python list and a NumPy scalar.
num_words_per_text = np.array([len(sequence) for sequence in text])

longest_text = find_longest_sequence(text)
# Cut-off length: mean + 2 standard deviations of the token counts.
max_words_text = np.mean(num_words_per_text) + 2 * np.std(num_words_per_text)

print("longest text has {} words".format(longest_text))
print("using maximum of {} words".format(max_words_text))
# The mean of a boolean array is the fraction of True entries.
print("this will cover {} of input text".format(np.mean(num_words_per_text < max_words_text)))

# pad_sequences needs an integer maxlen.
max_words_text = int(max_words_text)

In [None]:
# Texts: zeros are added at the front (padding='pre') up to the cut-off length.
text = pad_sequences(text, padding='pre', maxlen=max_words_text)

# Summaries: zeros are added at the end (padding='post') up to the longest summary.
longest_summary = find_longest_sequence(summary)
summary = pad_sequences(summary, padding='post', maxlen=longest_summary)

# Both shapes, one per line.
print(text.shape, summary.shape, sep="\n")

In [None]:
# Decode the third padded text back into words (padding zeros are skipped).
print(text_tokenizer.int2word_lookup(text[2]))

In [None]:
# Show the padded integer sequence of the fourth summary.
print(summary[3])

## Helper function, decode integers

This function takes a list of integer-encoded words and translates it back into the corresponding string, using the int2word dictionary that was created earlier.

In [None]:
# Example: decode the second padded text back into a string.
text_tokenizer.int2word_lookup(text[1])

## Prepare Data for Training (Teacher Forcing)

For every pair of text and summary, the encoder creates a final state that captures the contextual information present in the input text. The decoder then uses this final state emitted by the encoder to predict the target sequence. The decoder reads the entire target sequence word by word and predicts the same sequence offset by one timestep; that is, the decoder is trained to predict the next word in the sequence given the previous words.

https://www.analyticsvidhya.com/blog/2019/06/comprehensive-guide-text-summarization-using-deep-learning-python/
https://github.com/Hvass-Labs/TensorFlow-Tutorials/blob/master/21_Machine_Translation.ipynb

In [None]:
# Teacher forcing: the decoder input is each summary without its last position,
# and the target is the same summary shifted left by one position.
# NOTE(review): every array starts at row 1, so sample 0 is excluded from
# training — confirm this is intentional.
encoder_input_data = text[1:]
decoder_input_data = summary[1:, :-1]
decoder_output_data = summary[1:, 1:]

In [None]:
# First decoder input row (summary without its last position).
decoder_input_data[0]

In [None]:
# First decoder target row (the same summary shifted left by one position).
decoder_output_data[0]

In [None]:
# Model inputs, keyed by the names given to the two Input layers.
source_data = dict(
    encoder_input=encoder_input_data,
    decoder_input=decoder_input_data,
)

# Training target, keyed by the name of the decoder's Dense output layer.
target_data = dict(decoder_output=decoder_output_data)

## Creating the Model



### Encoding Layer

In [None]:
# Symbolic input: one sequence of integer token ids per sample, any length.
encoder_input = Input(name="encoder_input", shape=(None,))

# Maps token ids to dense vectors of size embedding_size.
encoder_embedding = Embedding(
    input_dim=vocab_size,
    output_dim=embedding_size,
    name="encoder_embedding",
)

# Three stacked GRUs. The first two emit the full sequence for the next layer;
# the last one emits only its final output.
encoder_hidden1 = GRU(lstm_state_size, name="encoder_hid1", return_sequences=True)
encoder_hidden2 = GRU(lstm_state_size, name="encoder_hid2", return_sequences=True)
encoder_hidden3 = GRU(lstm_state_size, name="encoder_hid3", return_sequences=False)

In [None]:
def connect_encoding_layer():
    """Wire ``encoder_input`` through the embedding and the three encoder GRUs.

    Returns:
        The output tensor of ``encoder_hidden3``.
    """
    tensor = encoder_embedding(encoder_input)

    # Feed each recurrent layer with the output of the previous one.
    for recurrent_layer in (encoder_hidden1, encoder_hidden2, encoder_hidden3):
        tensor = recurrent_layer(tensor)

    return tensor

In [None]:
# Output of the last encoder GRU (return_sequences=False); it is later handed
# to the decoder as its initial state.
encoder_states = connect_encoding_layer()

### Decoding Layer

In [None]:
# Symbolic input: the summary token ids fed to the decoder, any length.
decoder_input = Input(name="decoder_input", shape=(None,))

# Separate embedding for the summary vocabulary.
decoder_embedding = Embedding(
    input_dim=vocab_size,
    output_dim=embedding_size,
    name="decoder_embedding",
)

# Three stacked GRUs, all emitting the full sequence so every timestep gets a prediction.
decoder_hidden1 = GRU(lstm_state_size, name="decoder_hid1", return_sequences=True)
decoder_hidden2 = GRU(lstm_state_size, name="decoder_hid2", return_sequences=True)
decoder_hidden3 = GRU(lstm_state_size, name="decoder_hid3", return_sequences=True)

# Per-timestep softmax over the vocabulary.
decoder_dense = Dense(
    vocab_size,
    name='decoder_output',
    activation='softmax',
)

In [None]:
def connect_decoding_layer(decoder_init_state):
    """Wire ``decoder_input`` through the embedding, the decoder GRUs and the softmax.

    Args:
        decoder_init_state: tensor passed as ``initial_state`` to each of the
            three decoder GRUs.

    Returns:
        The output tensor of ``decoder_dense``.
    """
    tensor = decoder_embedding(decoder_input)

    # Every recurrent layer starts from the same initial state.
    for recurrent_layer in (decoder_hidden1, decoder_hidden2, decoder_hidden3):
        tensor = recurrent_layer(tensor, initial_state=decoder_init_state)

    return decoder_dense(tensor)

In [None]:
# Training graph: the decoder starts from the encoder's final output.
decoder_output = connect_decoding_layer(encoder_states)

## Connect the Encoder and Decoder Layers

In [None]:
# End-to-end training model: (text ids, shifted summary ids) -> next-token probabilities.
model = Model(inputs=[encoder_input, decoder_input],
              outputs=[decoder_output])

# `learning_rate` is the supported keyword; `lr` is a deprecated alias.
# The sparse loss takes integer targets directly, so no one-hot encoding is needed.
model.compile(optimizer=RMSprop(learning_rate=1e-3), loss='sparse_categorical_crossentropy')

### Callback Functions

In [None]:
# Save only the weights, and only when val_loss improves.
checkpoint_path = "checkpoint/checkpoint.keras"
callback_checkpoint = ModelCheckpoint(
    filepath=checkpoint_path,
    monitor='val_loss',
    save_best_only=True,
    save_weights_only=True,
    verbose=1,
)

# Stop training once val_loss has stopped improving (patience of 3 epochs).
callback_early_stopping = EarlyStopping(patience=3, monitor='val_loss', verbose=1)

# TensorBoard logging, with histograms and graph export switched off.
callback_tensorboard = TensorBoard(
    log_dir='logs/',
    write_graph=False,
    histogram_freq=0,
)

callbacks = [
    callback_early_stopping,
    callback_checkpoint,
    callback_tensorboard,
]

In [None]:
# Print the layer-by-layer overview of the training model.
model.summary()

In [None]:
# The checkpoint callback writes into this directory; create it up front so
# the first save cannot fail on a missing folder.
checkpoint_dir = os.path.dirname(checkpoint_path)
if checkpoint_dir:
    os.makedirs(checkpoint_dir, exist_ok=True)

# Train with teacher forcing. The callbacks built earlier (early stopping,
# checkpointing, TensorBoard) only take effect when they are passed to fit().
model.fit(x=source_data,
          y=target_data,
          batch_size=batch_size,
          validation_split=validation_split,
          epochs=epochs,
          callbacks=callbacks)

## Inference Mode

First, set up the encoder model.

In [None]:
# Inference-time encoder: maps input token ids to the single tensor produced by
# the last encoder GRU.
model_encoder = Model(inputs=[encoder_input],
                      outputs=[encoder_states])

Set up the decoder model.

In [None]:
# The decoder layers are GRUs, which carry a single state tensor (there is no
# separate LSTM-style cell state), so inference needs exactly one state input.
decoder_initial_state = Input(shape=(lstm_state_size,),
                              name='decoder_initial_state')

# Reuse the trained decoder layers, seeded from the explicit state input.
# connect_decoding_layer returns a single tensor; a separate name keeps the
# training graph's `decoder_output` tensor intact.
decoder_inference_output = connect_decoding_layer(decoder_initial_state)

model_decoder = Model(inputs=[decoder_input, decoder_initial_state],
                      outputs=[decoder_inference_output])

# Backward-compatible alias for the original (misspelled) variable name.
model_deocder = model_decoder

### Helper Function Used to Predict the Summary of Input Text

In [None]:
def summarize(input_text):
    """Greedily decode and print a summary for a single input text.

    The text is tokenized and padded like the training inputs, encoded once,
    and the decoder is then run repeatedly, each time appending the most
    probable next token, until ``end_token`` is produced or the length limit
    (``longest_summary``) is reached.

    Args:
        input_text: raw text string to summarize.

    Raises:
        ValueError: if ``start_token`` or ``end_token`` does not map to exactly
            one id in the summary vocabulary.
    """
    def token_id(token):
        # Go through the tokenizer so the marker receives the same
        # preprocessing as the summaries the tokenizer was fitted on.
        # NOTE(review): assumes the summaries contain start_token/end_token —
        # confirm in load_data.
        ids = summary_tokenizer.texts_to_sequences([token])[0]
        if len(ids) != 1:
            raise ValueError(
                "token {!r} is not a single word in the summary vocabulary".format(token))
        return ids[0]

    start_int = token_id(start_token)
    end_int = token_id(end_token)

    # Encode the text like the training inputs: tokenize, then pre-pad.
    input_ints = text_tokenizer.word2int_lookup([input_text])
    input_ints = pad_sequences(input_ints, maxlen=int(max_words_text), padding='pre')

    # model_encoder has a single output, which seeds the decoder.
    initial_state = model_encoder.predict(input_ints)

    # Summaries are limited by the summary length used in training, not by the
    # (much longer) text length.
    max_tokens = int(longest_summary)

    # Decoder input buffer; filled one position per iteration, zeros elsewhere.
    # The builtin int replaces the np.int alias, which newer NumPy releases removed.
    decoder_input_data = np.zeros(shape=(1, max_tokens), dtype=int)
    sampled_token_int = start_int

    # Predicted words so far, beginning with the start marker.
    predicted_summary = [start_token]
    count_tokens = 0

    while sampled_token_int != end_int and count_tokens < max_tokens:
        # Append the most recent token to the decoder input.
        decoder_input_data[0, count_tokens] = sampled_token_int

        input_data = {
            'decoder_initial_state': initial_state,
            'decoder_input': decoder_input_data
        }

        prediction = model_decoder.predict(input_data)
        # Accept a decoder that returns extra outputs after the token
        # probabilities as well as one that returns them alone.
        if isinstance(prediction, (list, tuple)):
            output_tokens = prediction[0]
        else:
            output_tokens = prediction

        # Probability distribution for the position just filled in.
        last_token = output_tokens[0, count_tokens, :]

        # Greedy choice: the most probable token id.
        sampled_token_int = int(np.argmax(last_token))

        # Id 0 (padding) has no word; skip it instead of raising KeyError.
        sampled_word = summary_tokenizer.int2word.get(sampled_token_int)
        if sampled_word is not None:
            predicted_summary.append(sampled_word)
        count_tokens += 1

    predicted_summary = " ".join(predicted_summary)

    print(predicted_summary)