# 1. Import the Libraries

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import pickle
import string
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
# from tensorflow.keras.utils import np_utils
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input,LSTM,Embedding,Dense,Concatenate,TimeDistributed,Bidirectional,Attention, Lambda
from tensorflow.keras.models import Model, Sequential,load_model
from tensorflow.keras.callbacks import EarlyStopping
import tensorflow.keras.backend as k

# 2. Developing the Attentive Layer 

We will use this as a last layer to enhance the accuracy of the output prediction.

In [None]:
import tensorflow as tf
from tensorflow.python.keras import backend as K

logger = tf.get_logger()

class AttentionLayer(tf.keras.layers.Layer):
    """Additive (Bahdanau) attention over an encoder output sequence.

    Implements the scoring function of https://arxiv.org/pdf/1409.0473.pdf
    using three trainable weight matrices W_a, U_a and V_a.  For every decoder
    time step the layer computes ``softmax(tanh(S.W_a + h_j.U_a).V_a)`` over
    the encoder time steps and uses those weights to form a weighted sum of
    the encoder outputs.

    Call arguments:
        inputs: ``[encoder_output_sequence, decoder_output_sequence]``; both
            are indexed as rank-3 ``(batch, time, features)`` tensors.

    Returns:
        ``(context_vectors, attention_weights)``:
        context vectors are ``(batch, de_seq_len, encoder_features)`` and
        attention weights are ``(batch, de_seq_len, en_seq_len)``.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def build(self, input_shape):
        """Create W_a, U_a and V_a from the encoder/decoder feature sizes.

        Args:
            input_shape: ``[encoder_shape, decoder_shape]``; only index 2
                (the feature axis) of each shape is read here.
        """
        assert isinstance(input_shape, (list, tuple))
        encoder_dim = input_shape[0][2]
        decoder_dim = input_shape[1][2]

        # W_a projects encoder outputs, U_a projects the decoder state into
        # the same space, and V_a reduces the combined vector to one score.
        self.W_a = self.add_weight(name='W_a',
                                   shape=tf.TensorShape((encoder_dim, encoder_dim)),
                                   initializer='uniform',
                                   trainable=True)
        self.U_a = self.add_weight(name='U_a',
                                   shape=tf.TensorShape((decoder_dim, encoder_dim)),
                                   initializer='uniform',
                                   trainable=True)
        self.V_a = self.add_weight(name='V_a',
                                   shape=tf.TensorShape((encoder_dim, 1)),
                                   initializer='uniform',
                                   trainable=True)

        super().build(input_shape)  # Be sure to call this at the end

    def call(self, inputs):
        """Compute context vectors and attention weights.

        Args:
            inputs: ``[encoder_out_seq, decoder_out_seq]``.

        Returns:
            Tuple ``(c_outputs, e_outputs)`` - the context vectors and the
            attention weights for every decoder time step.
        """
        # isinstance (rather than an exact type comparison) so that list
        # subclasses and tuples are accepted, consistent with build().
        assert isinstance(inputs, (list, tuple))
        encoder_out_seq, decoder_out_seq = inputs

        logger.debug("encoder_out_seq.shape = %s", encoder_out_seq.shape)
        logger.debug("decoder_out_seq.shape = %s", decoder_out_seq.shape)

        def energy_step(inputs, states):
            """Attention weights for a single decoder time step.

            inputs: decoder output at this step, (batch_size, de_dim)
            states: state list handed over by K.rnn; the last entry is the
                full encoder sequence passed through ``constants``.
            """
            logger.debug("Running energy computation step")

            if not isinstance(states, (list, tuple)):
                raise TypeError(f"States must be an iterable. Got {states} of type {type(states)}")

            encoder_full_seq = states[-1]

            # S.W_a where S = [s0, s1, ..., si]
            # => (batch_size, en_seq_len, en_dim)
            W_a_dot_s = K.dot(encoder_full_seq, self.W_a)

            # h_j.U_a, with a time axis of length 1 so it broadcasts over
            # the encoder steps => (batch_size, 1, en_dim)
            U_a_dot_h = K.expand_dims(K.dot(inputs, self.U_a), 1)

            logger.debug("U_a_dot_h.shape = %s", U_a_dot_h.shape)

            # tanh(S.W_a + h_j.U_a) => (batch_size, en_seq_len, en_dim)
            Ws_plus_Uh = K.tanh(W_a_dot_s + U_a_dot_h)

            logger.debug("Ws_plus_Uh.shape = %s", Ws_plus_Uh.shape)

            # Scores tanh(...).V_a => (batch_size, en_seq_len)
            e_i = K.squeeze(K.dot(Ws_plus_Uh, self.V_a), axis=-1)
            # Normalise over the encoder steps => (batch_size, en_seq_len)
            e_i = K.softmax(e_i)

            logger.debug("ei.shape = %s", e_i.shape)

            return e_i, [e_i]

        def context_step(inputs, states):
            """Context vector c_i for one decoder step from its weights e_i.

            inputs: attention weights for this step, (batch_size, en_seq_len)
            states: state list handed over by K.rnn; the last entry is the
                full encoder sequence passed through ``constants``.
            """
            logger.debug("Running attention vector computation step")

            if not isinstance(states, (list, tuple)):
                raise TypeError(f"States must be an iterable. Got {states} of type {type(states)}")

            encoder_full_seq = states[-1]

            # Weighted sum of encoder outputs => (batch_size, en_dim)
            c_i = K.sum(encoder_full_seq * K.expand_dims(inputs, -1), axis=1)

            logger.debug("ci.shape = %s", c_i.shape)

            return c_i, [c_i]

        # Attention is stateless, but K.rnn requires an initial state whose
        # shape matches what each step function returns as its new state.
        fake_state_c = K.sum(encoder_out_seq, axis=1)  # => (batch_size, en_dim)
        fake_state_e = K.sum(encoder_out_seq, axis=2)  # => (batch_size, en_seq_len)

        # Attention weights for every decoder step
        # e_outputs => (batch_size, de_seq_len, en_seq_len)
        _, e_outputs, _ = K.rnn(
            energy_step, decoder_out_seq, [fake_state_e], constants=[encoder_out_seq]
        )

        # Context vectors for every decoder step
        # c_outputs => (batch_size, de_seq_len, en_dim)
        _, c_outputs, _ = K.rnn(
            context_step, e_outputs, [fake_state_c], constants=[encoder_out_seq]
        )

        return c_outputs, e_outputs

    def compute_output_shape(self, input_shape):
        """Return the shapes of ``(context_vectors, attention_weights)``.

        The context vectors are weighted sums of encoder outputs, so their
        feature size is the encoder's (``input_shape[0][2]``), not the
        decoder's.
        """
        return [
            tf.TensorShape((input_shape[1][0], input_shape[1][1], input_shape[0][2])),
            tf.TensorShape((input_shape[1][0], input_shape[1][1], input_shape[0][1]))
        ]

# 3. Loading pre-computed Data

Below we load:
1. pre-processed Dataset
2. pre-computed train and test arrays
3. pre-computed tokenizers

In [None]:
# Read the pre-processed train and test splits from CSV files located in the
# current working directory.
df_train=pd.read_csv('gigaword_train.csv')
df_test=pd.read_csv('gigaword_test.csv')

In [None]:
# Display the first rows of the training frame as a quick sanity check.
df_train.head()

In [None]:
# Sequence lengths in tokens: max_text_len fixes the encoder input length and
# max_summary_len bounds the length of a decoded summary.
# NOTE(review): presumably these equal the padded widths of the pre-computed
# x_*/y_* arrays loaded below - confirm against the preprocessing step.
max_text_len=27
max_summary_len=13

In [None]:
from numpy import load

# Pre-computed arrays for the train / test / validation splits, read from the
# current working directory. The x_* arrays feed the encoder and the y_*
# arrays are used for the decoder inputs and targets.
# NOTE(review): presumably 2-D integer token-id arrays (they are sliced as
# [:, :-1] and looked up in the tokenizers' index_word maps) - confirm.
x_train = load(r'x_train.npy')
x_test = load(r'x_test.npy')
x_valid = load('x_validate.npy')
y_train = load(r'y_train.npy')
y_test = load(r'y_test.npy')
y_valid=load('y_validate.npy')

In [None]:
import pickle
# Restore the fitted source (x) and target (y) tokenizers.
# SECURITY: pickle.load executes arbitrary code embedded in the file; only
# load tokenizer pickles that come from a trusted source.
with open(r'x_tokenizer.pickle', 'rb') as handle:
    x_tokenizer = pickle.load(handle)

with open(r'y_tokenizer.pickle', 'rb') as handle:
    y_tokenizer = pickle.load(handle)

# Vocabulary sizes used for the embedding and softmax dimensions below.
# NOTE(review): assumes both tokenizers were created with num_words set; if
# num_words is None the additions below raise TypeError - confirm.
x_vocab = x_tokenizer.num_words +1
y_vocab = y_tokenizer.num_words +1

print(f'X-Vocab: {x_vocab} \nY-Vocab: {y_vocab}')

In [None]:
# Source-side lookup tables: word -> id and id -> word.
word2id = x_tokenizer.word_index
id2word = x_tokenizer.index_word
# print(x_tokenizer.index_word)
# CBOW hyper-parameters: vocabulary size, embedding width and the number of
# context tokens taken on each side of the target token.
vocab_size = x_vocab
emded_size=300
window_size=5

## Creating the function which will produce training batches

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

def generate_context_word_pairs(corpus, window_size, vocab_size):
    """Yield one CBOW (context, target) training pair per token in ``corpus``.

    Args:
        corpus: iterable of token-id sequences.
        window_size: number of neighbouring tokens taken on each side of the
            target token.
        vocab_size: number of classes for the one-hot encoded target.

    Yields:
        ``(x, y)`` where ``x`` is a single-row batch holding the neighbouring
        token ids, padded by ``pad_sequences`` to ``2 * window_size`` entries,
        and ``y`` is the one-hot encoding of the target token as a single-row
        batch with ``vocab_size`` classes.
    """
    padded_width = 2 * window_size
    for sentence in corpus:
        n_tokens = len(sentence)
        for position, target in enumerate(sentence):
            lo = position - window_size
            hi = position + window_size + 1

            # Neighbours inside the window, excluding the target itself and
            # any position that falls outside the sentence.
            neighbours = [
                sentence[j]
                for j in range(lo, hi)
                if j != position and 0 <= j < n_tokens
            ]

            # Both helpers operate on batches, hence the one-element lists.
            x = pad_sequences([neighbours], maxlen=padded_width)
            y = tf.keras.utils.to_categorical([target], vocab_size)
            yield (x, y)
            
            
# Smoke-test the generator on the training data: print pairs whose context
# window is completely filled (no id 0). The counter is checked after the
# print, so up to 11 pairs are shown before the loop stops.
i = 0
for x, y in generate_context_word_pairs(corpus=x_train, window_size=window_size, vocab_size=vocab_size):
    if 0 in x[0]:
        continue

    print('Context (X):', [id2word[w] for w in x[0]], '-> Target (Y):', id2word[np.argwhere(y[0])[0][0]])

    if i == 10:
        break
    i += 1

# 4. Creating the Word2Vec model

This model will be used as a first layer in our PG Model

In [None]:
import tensorflow as tf
import tensorflow.keras.backend as k
from tensorflow.keras.layers import Input,LSTM,Embedding,Dense,Concatenate,TimeDistributed,Bidirectional,Attention, Lambda
from tensorflow.keras.models import Model, Sequential,load_model

# Start from a clean Keras state before building the CBOW model.
k.clear_session()
with tf.device('/gpu:2'):
    # CBOW: embed the 2 * window_size context ids, average the embeddings
    # over the context axis and predict the target word with a softmax over
    # the vocabulary.
    cbow = Sequential()
    cbow.add(Embedding(input_dim=vocab_size, output_dim=emded_size, input_length=window_size*2))
    cbow.add(Lambda(lambda x: k.mean(x, axis=1), output_shape=(emded_size,)))
    cbow.add(Dense(vocab_size, activation='softmax'))
    cbow.compile(loss='categorical_crossentropy', optimizer='rmsprop')

# NOTE(review): no cbow.fit(...) call is visible in this file, so the
# embedding weights read from this model later appear to be the initial
# values - confirm whether CBOW training happens elsewhere.

# View the model summary. summary() prints the table itself and returns
# None, so it must not be wrapped in print().
cbow.summary()

In [None]:
with tf.device('/gpu:2'):
    # First weight tensor of the CBOW model (its Embedding matrix) with row 0
    # removed, leaving vocab_size - 1 rows.
    # NOTE(review): after dropping row 0, row i holds the vector that CBOW
    # stored for token id i + 1, while wv_layer is later indexed with the raw
    # token ids - confirm this shift is intended.
    weights = cbow.get_weights()[0][1:]

## Creating the embedding layer

In [None]:
# Encoder embedding layer initialised from the CBOW weights. The output width
# must equal the width of `weights`, so it is taken from emded_size (the value
# the CBOW embedding was built with) instead of repeating the literal 300.
wv_layer = Embedding(
    input_dim=vocab_size - 1,   # `weights` has vocab_size - 1 rows
    output_dim=emded_size,
    mask_zero=False,
    weights=[weights],
    input_length=max_text_len,
    trainable=True,
)

# 5. Creating PG Model

We build the PG model with a latent dimension of 256 and an embedding dimensionality of 300. The encoder consists of two stacked bi-directional LSTM layers and the decoder is a single LSTM layer followed by the attention layer, as stated in the research paper.

In [None]:
# Build the training graph of the encoder-decoder model with attention.
# Several names defined here (encoder_inputs, encoder_outputs, state_h,
# state_c, decoder_inputs, dec_emb_layer, decoder_lstm, attn_layer,
# decoder_dense, model) are reused by later cells.
k.clear_session()

latent_dim = 256
embedding_dim = 300

with tf.device('/gpu:2'):
    # Encoder input: fixed-length sequences of max_text_len token ids
    encoder_inputs = Input(shape=(max_text_len, ),name='Encoder_Inputs')

    # Embedding layer (initialised from the CBOW weights)
    enc_emb = wv_layer(encoder_inputs)

    # Encoder LSTM 1: bidirectional, returns the full output sequence plus the
    # final h/c states of the forward and backward directions
    encoder_lstm1 = Bidirectional(LSTM(latent_dim, return_sequences=True,
                         return_state=True, dropout=0.2,
                         recurrent_dropout=0.2,name='Encoder_BiLSTM_Layer1'))
    (encoder_output1, forward_state_h1, forward_state_c1,backward_state_h1,backward_state_c1) = encoder_lstm1(enc_emb)
    # Concatenated first-layer states; not referenced again in the visible code
    state_h1=Concatenate()([forward_state_h1,backward_state_h1])
    state_c1=Concatenate()([forward_state_c1,backward_state_c1])

    # Encoder LSTM 2: consumes the output sequence of the first layer
    encoder_lstm2 = Bidirectional(LSTM(latent_dim, return_sequences=True,
                         return_state=True, dropout=0.2,
                         recurrent_dropout=0.2,name='Encoder_BiLSTM_Layer2'))
    (encoder_outputs, forward_state_h2, forward_state_c2,backward_state_h2,backward_state_c2) = encoder_lstm2(encoder_output1)
    # Forward and backward states are concatenated so that their width
    # (2 * latent_dim) matches the decoder LSTM's units
    state_h=Concatenate()([forward_state_h2,backward_state_h2])
    state_c=Concatenate()([forward_state_c2,backward_state_c2])

    # Set up the decoder, using the concatenated encoder states as the initial state
    decoder_inputs = Input(shape=(None, ),name='Decoder_Inputs')

    # Embedding layer for the target vocabulary
    dec_emb_layer = Embedding(y_vocab, embedding_dim, trainable=True, name='Decoder_Embedding_Inputs')
    dec_emb = dec_emb_layer(decoder_inputs)

    # Decoder LSTM1
    # The LSTM is not wrapped in Bidirectional: the two extra return values are
    # its two final state tensors, despite the "fwd"/"back" variable names.
    decoder_lstm = LSTM(latent_dim*2, return_sequences=True,
                        return_state=True, dropout=0.2,
                        recurrent_dropout=0.2,name='Decoder_LSTM_Layer')
    (decoder_outputs, decoder_fwd_state, decoder_back_state) = decoder_lstm(dec_emb, initial_state=[state_h, state_c])

    # Attention Layer: attends over the encoder outputs for every decoder step
    attn_layer = AttentionLayer(name='Attention_Layer')
    attn_out,attn_states=attn_layer([encoder_outputs,decoder_outputs])

    # Concatenate the decoder outputs with the attention context vectors
    decoder_concat_input=Concatenate(axis=-1,name='Concat_layer')([decoder_outputs,attn_out])
    # Dense layer: per-time-step softmax over the target vocabulary.
    # Note that the name= argument is applied to the inner Dense layer, not to
    # the TimeDistributed wrapper.
    decoder_dense = TimeDistributed(Dense(y_vocab, activation='softmax',name='TimeDistribution_Layer'))
    decoder_outputs = decoder_dense(decoder_concat_input)

    # Define the model
    model = Model([encoder_inputs, decoder_inputs], decoder_outputs,name='PG_Model')

    model.summary()

## Creating the learning rate scheduler

This reduces the learning rate by 25% after each epoch. The initial learning rate is 0.002, as stated in the research paper.

In [None]:
from tensorflow.keras.callbacks import LearningRateScheduler
import math
# learning rate schedule
def step_decay(epoch):
    """Return the learning rate for ``epoch`` under a stepwise decay.

    The rate starts at 0.002 and is multiplied by 0.75 once for every
    completed epoch.

    Args:
        epoch: epoch index as supplied by ``LearningRateScheduler``.

    Returns:
        The learning rate to use for that epoch.
    """
    base_rate, decay_factor, epochs_per_drop = 0.002, 0.75, 1.0
    completed_drops = math.floor(epoch / epochs_per_drop)
    return base_rate * math.pow(decay_factor, completed_drops)

## Creating Adam Optimizer

Creating Adam optimizer with loss evaluation as 'sparse_categorical_crossentropy'

In [None]:
# Compile with Adam and sparse categorical cross-entropy (targets are integer
# token ids). The optimizer starts at the schedule's first value instead of
# 0.0, so the model still trains if the scheduler callback is ever left out of
# fit(); with the callback in place the rate is overwritten at the start of
# every epoch.
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=step_decay(0)),
    loss='sparse_categorical_crossentropy',
)
# Callback that applies step_decay at the beginning of each epoch.
lrate = LearningRateScheduler(step_decay, verbose=1)

## Creating Early Stopping

It will stop training the model if `val_loss` does not decrease for 4 consecutive epochs.

In [None]:
# Location where the best model (lowest val_loss) is saved.
# NOTE(review): newer Keras releases are understood to require a '.keras' or
# '.h5' suffix on this path - confirm for the version in use.
checkpoint_filepath = 'PG Model'

# Stop once val_loss has failed to improve for 4 epochs in a row.
es = EarlyStopping(monitor='val_loss', patience=4, mode='min', verbose=1)

# Save the model only when val_loss reaches a new minimum.
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_best_only=True,
    monitor='val_loss',
    mode='min',
    verbose=1,
)

In [None]:
# Train with teacher forcing: the decoder input is the target sequence without
# its last token and the label is the target sequence shifted left by one,
# with a trailing axis of size 1 for the sparse loss.
#
# use_multiprocessing / workers were removed: they only apply to generator or
# keras.utils.Sequence inputs, whereas NumPy arrays are passed here, and
# workers=-1 is not a meaningful worker count.
#
# NOTE(review): initial_epoch=7 makes training (and the learning-rate
# schedule) start at epoch index 7. No weights are loaded in the visible code,
# so this looks like a leftover from resuming an earlier run - confirm, and
# set it to 0 when training from scratch.
with tf.device('/gpu:2'):
    history = model.fit(
        [x_train, y_train[:, :-1]],
        y_train.reshape(y_train.shape[0], y_train.shape[1], 1)[:, 1:],
        initial_epoch=7,
        epochs=15,
        callbacks=[es, checkpoint, lrate],
        batch_size=64,
        shuffle=True,
        validation_data=(
            [x_valid, y_valid[:, :-1]],
            y_valid.reshape(y_valid.shape[0], y_valid.shape[1], 1)[:, 1:],
        ),
    )

In [None]:
from matplotlib import pyplot

# Plot the per-epoch training and validation loss recorded by fit().
# val_loss is computed on the validation split (x_valid / y_valid), not on the
# test split, so the curve is labelled accordingly.
pyplot.plot(history.history['loss'], label='train')
pyplot.plot(history.history['val_loss'], label='validation')
pyplot.legend()
pyplot.show()

Next, let’s build the dictionary to convert the index to word for target and source vocabulary:

In [None]:
# id -> word maps for the target (summary) and source (text) vocabularies,
# and the word -> id map for the target vocabulary; used by the decoding
# helpers below.
reverse_target_word_index = y_tokenizer.index_word
reverse_source_word_index = x_tokenizer.index_word
target_word_index = y_tokenizer.word_index

## Inference

Set up the inference for the encoder and decoder:

In [None]:
# Inference graph: the trained layers are reused (dec_emb_layer, decoder_lstm,
# attn_layer, decoder_dense) and rewired into two standalone models.

# Encode the input sequence to get the feature vector
# Outputs: the full encoder output sequence plus the concatenated h / c states
encoder_model = Model(inputs=encoder_inputs, outputs=[encoder_outputs, state_h, state_c])

# Decoder setup
# Below tensors will hold the states of the previous time step
decoder_state_input_h = Input(shape=(latent_dim*2,))
decoder_state_input_c = Input(shape=(latent_dim*2,))
# Placeholder for the encoder output sequence that attention attends over
decoder_hidden_state_input = Input(shape=(max_text_len,latent_dim*2))

# Get the embeddings of the decoder sequence
dec_emb2= dec_emb_layer(decoder_inputs) 
# To predict the next word in the sequence, set the initial states to the states from the previous time step
decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=[decoder_state_input_h, decoder_state_input_c])

# Attention at inference time: same layer, fed with the encoder outputs
# supplied through decoder_hidden_state_input
attn_out_inf, attn_states_inf = attn_layer([decoder_hidden_state_input, decoder_outputs2])
decoder_inf_concat = Concatenate(axis=-1, name='concat')([decoder_outputs2, attn_out_inf])

# A dense softmax layer to generate prob dist. over the target vocabulary
decoder_outputs2 = decoder_dense(decoder_inf_concat) 

# Final decoder model
# Inputs: [decoder tokens, encoder outputs, previous h, previous c]
# Outputs: [token probabilities, new h, new c]
decoder_model = Model(
    [decoder_inputs] + [decoder_hidden_state_input,decoder_state_input_h, decoder_state_input_c],
    [decoder_outputs2] + [state_h2, state_c2])

Let us define the functions to convert an integer sequence to a word sequence for summary as well as the reviews

In [None]:
def decode_sequence(input_seq):
    """Greedily decode a summary for one source sequence.

    The source is encoded once, then the decoder is run one token at a time,
    feeding the most probable token back in, until 'eostok' is produced or
    the summary holds ``max_summary_len - 1`` words.

    Args:
        input_seq: batch of one source sequence, as accepted by
            ``encoder_model.predict``.

    Returns:
        The decoded words, each preceded by a single space (so a non-empty
        result starts with a space); an empty string if decoding stops before
        any word is produced.
    """
    # Encode the input as the encoder output sequence plus state vectors.
    e_out, e_h, e_c = encoder_model.predict(input_seq)

    # Target sequence of length 1, primed with the start word.
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = target_word_index['sostok']

    decoded_sentence = ''
    while True:
        output_tokens, h, c = decoder_model.predict([target_seq] + [e_out, e_h, e_c])

        # Greedy choice: most probable token at the last time step.
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        # .get() instead of indexing: an id with no entry in the index map
        # (e.g. the padding id 0) would otherwise raise KeyError. Such an id
        # carries no word, so it is treated as the end of the summary.
        sampled_token = reverse_target_word_index.get(sampled_token_index)

        if sampled_token is None or sampled_token == 'eostok':
            break

        decoded_sentence += ' ' + sampled_token

        # Exit condition: maximum summary length reached.
        if len(decoded_sentence.split()) >= (max_summary_len - 1):
            break

        # Feed the sampled token back in as the next decoder input.
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index

        # Carry the decoder states over to the next step.
        e_h, e_c = h, c

    return decoded_sentence

In [None]:
def seq2summary(input_seq):
    """Convert a sequence of target token ids into a summary string.

    Ids equal to 0 and the ids of 'sostok' / 'eostok' are skipped. Every
    remaining word is followed by a single space, so a non-empty result ends
    with a trailing space.
    """
    pieces = []
    for token_id in input_seq:
        if (token_id == 0
                or token_id == target_word_index['sostok']
                or token_id == target_word_index['eostok']):
            continue
        pieces.append(reverse_target_word_index[token_id] + ' ')
    return ''.join(pieces)

def seq2text(input_seq):
    """Convert a sequence of source token ids into a text string.

    Ids equal to 0 are skipped. Every remaining word is followed by a single
    space, so a non-empty result ends with a trailing space.
    """
    return ''.join(
        reverse_source_word_index[token_id] + ' '
        for token_id in input_seq
        if token_id != 0
    )

In [None]:
# Print the source text, reference summary and predicted summary for the
# first two test examples. Each source row is reshaped into a batch of one
# before it is passed to decode_sequence.
for i in range(0, 2):
    print("Review:",seq2text(x_test[i]))
    print("Original summary:",seq2summary(y_test[i]))
    print("Predicted summary:",decode_sequence(x_test[i].reshape(1,max_text_len)))
    print("\n")

# 6. Rouge Scores

Calculating the ROUGE scores and saving them to a CSV file for graph plotting and analysis.

In [None]:
from rouge import Rouge

# Score the predicted summaries against the reference summaries over the
# whole test set and print the averaged ROUGE scores.
ROUGE = Rouge()
Original_summary = []
Predicted_summary = []
# Only the reference and the prediction are needed for scoring, so the source
# text is no longer converted back to words for every row.
for source_row, reference_row in zip(x_test, y_test):
    Original_summary.append(seq2summary(reference_row))
    Predicted_summary.append(decode_sequence(source_row.reshape(1, max_text_len)))

# NOTE(review): decode_sequence can return an empty string, and the rouge
# package is understood to reject empty hypotheses - confirm, and consider
# ignore_empty=True or a placeholder if that occurs.
scores = ROUGE.get_scores(Predicted_summary, Original_summary, avg=True)
print(scores)

In [None]:
# Put the averaged ROUGE scores into a DataFrame, write it to a CSV file in
# the current directory and print it.
df = pd.DataFrame(scores)
df.to_csv('./PG_Results.csv')
print(df)