In [1]:
import os
import re
import pickle
import string
import unicodedata
from random import randint

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

#nlk libraries
from nltk.corpus import stopwords
from wordcloud import STOPWORDS, WordCloud

# sklearn Libraries 
from sklearn.model_selection import train_test_split

# tensorflow Libraries
import tensorflow as tf
from tensorflow.keras import Input, Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.layers import LSTM, Bidirectional, Dense, Embedding, TimeDistributed

In [2]:
# NOTE(review): no visible cell imports `contractions`; the notebook uses its own
# CONTRACTION_MAP instead — confirm this install is still needed.
!pip install -q contractions==0.0.48

In [3]:
# Contraction -> expansion lookup used by `lemmatise_words`.
# Keys are matched literally, so both "I'..." and "i'..." spellings are listed.
CONTRACTION_MAP = {
    "ain't": "is not", "aren't": "are not", "can't": "cannot",
    "can't've": "cannot have", "'cause": "because", "could've": "could have",
    "couldn't": "could not", "couldn't've": "could not have", "didn't": "did not",
    "doesn't": "does not", "don't": "do not", "hadn't": "had not",
    "hadn't've": "had not have", "hasn't": "has not", "haven't": "have not",
    "he'd": "he would", "he'd've": "he would have", "he'll": "he will",
    # Fixed: the expansion used to read "he he will have".
    "he'll've": "he will have", "he's": "he is", "how'd": "how did",
    "how'd'y": "how do you", "how'll": "how will", "how's": "how is",
    "I'd": "I would", "I'd've": "I would have", "I'll": "I will",
    "I'll've": "I will have", "I'm": "I am", "I've": "I have",
    "i'd": "i would", "i'd've": "i would have", "i'll": "i will",
    "i'll've": "i will have", "i'm": "i am", "i've": "i have",
    "isn't": "is not", "it'd": "it would", "it'd've": "it would have",
    "it'll": "it will", "it'll've": "it will have", "it's": "it is",
    "let's": "let us", "ma'am": "madam", "mayn't": "may not",
    "might've": "might have", "mightn't": "might not", "mightn't've": "might not have",
    "must've": "must have", "mustn't": "must not", "mustn't've": "must not have",
    "needn't": "need not", "needn't've": "need not have", "o'clock": "of the clock",
    "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not",
    "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would",
    "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have",
    "she's": "she is", "should've": "should have", "shouldn't": "should not",
    "shouldn't've": "should not have", "so've": "so have", "so's": "so as",
    "this's": "this is",
    "that'd": "that would", "that'd've": "that would have", "that's": "that is",
    "there'd": "there would", "there'd've": "there would have", "there's": "there is",
    "they'd": "they would", "they'd've": "they would have", "they'll": "they will",
    "they'll've": "they will have", "they're": "they are", "they've": "they have",
    "to've": "to have", "wasn't": "was not", "we'd": "we would",
    "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have",
    "we're": "we are", "we've": "we have", "weren't": "were not",
    "what'll": "what will", "what'll've": "what will have", "what're": "what are",
    "what's": "what is", "what've": "what have", "when's": "when is",
    "when've": "when have", "where'd": "where did", "where's": "where is",
    "where've": "where have", "who'll": "who will", "who'll've": "who will have",
    "who's": "who is", "who've": "who have", "why's": "why is",
    "why've": "why have", "will've": "will have", "won't": "will not",
    "won't've": "will not have", "would've": "would have", "wouldn't": "would not",
    "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would",
    "y'all'd've": "you all would have", "y'all're": "you all are", "y'all've": "you all have",
    "you'd": "you would", "you'd've": "you would have", "you'll": "you will",
    "you'll've": "you will have", "you're": "you are", "you've": "you have",
}

In [4]:
# Using TPU
# This cell requires a reachable TPU; the resulting `tpu_strategy` is used later
# for model construction (strategy scope) and for sizing the batch.

# detect and init the TPU
tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
tf.config.experimental_connect_to_cluster(tpu)
tf.tpu.experimental.initialize_tpu_system(tpu)

# instantiate a distribution strategy
# NOTE(review): newer TensorFlow releases presumably expose this as
# tf.distribute.TPUStrategy — confirm against the installed version.
tpu_strategy = tf.distribute.experimental.TPUStrategy(tpu)

In [5]:
# Paths of the two CSV files that make up the corpus (relative to the notebook).
fn1 = '../input/news-summary/news_summary.csv'
fn2 = '../input/news-summary/news_summary_more.csv'

In [6]:
# Both files are decoded as ISO-8859-1 and given a fresh 0..n-1 index.
df1 = pd.read_csv(fn1, encoding='iso-8859-1').reset_index(drop=True)
df2 = pd.read_csv(fn2, encoding='iso-8859-1').reset_index(drop=True)

In [7]:
# Keep only the `headlines` and `text` columns of df1, stack it on top of df2,
# then shuffle the rows.
# The drop is done without `inplace=True`, and the shuffle is seeded so that the
# later seeded train/validation split is reproducible across kernel restarts.
df1_columns = [column for column in df1.columns if column not in ('headlines', 'text')]
df1 = df1.drop(columns=df1_columns)
df = pd.concat([df1, df2], axis='rows')
del df1, df2
df = df.sample(frac=1, random_state=1).reset_index(drop=True)

In [8]:
def lemmatise_words(text, cmap=CONTRACTION_MAP):
    """Expand contractions found in ``text`` and strip remaining apostrophes.

    Despite its name, this function performs contraction expansion (for
    example "can't" -> "cannot"), not lemmatisation.

    Args:
        text: String to process. Matching is case-sensitive against the keys
            of ``cmap``.
        cmap: Mapping of contraction -> expansion.

    Returns:
        ``text`` with every key of ``cmap`` replaced by its expansion and all
        remaining ``'`` characters removed.
    """
    # Regex alternation picks the first alternative that matches, so longer
    # keys must come first ("can't've" before "can't"); otherwise the shorter
    # key wins and leaves a dangling "'ve" behind. Keys are escaped so they
    # are always matched literally.
    ckeys = '|'.join(re.escape(key) for key in sorted(cmap, key=len, reverse=True))
    patterns = re.compile(f'({ckeys})', flags=re.DOTALL)

    def lemmatise(contraction):
        match = contraction.group(0)
        lemattised = cmap.get(match)
        # Keep the matched text when the map offers no usable expansion.
        # (The previous guard tested the function object, which is always truthy.)
        if not lemattised:
            return match
        return lemattised

    # With an empty map the pattern would match the empty string everywhere.
    lemmatise_text = patterns.sub(lemmatise, text) if cmap else text
    lemmatise_text = re.sub("'", "", lemmatise_text)
    return lemmatise_text

In [9]:
# Lowercase both columns before any further text processing.
for _column in ('text', 'headlines'):
    df[_column] = df[_column].map(str.lower)

In [10]:
# Expand contractions in both columns (lemmatise_words replaces CONTRACTION_MAP keys
# with their expansions and strips apostrophes), then preview five random rows.
df.headlines = df.headlines.apply(lemmatise_words)
df.text = df.text.apply(lemmatise_words)
df.sample(5)

In [11]:
# Remove punctuation from word
def punctuation_removal(word):
    """Return ``word`` without any character listed in ``string.punctuation``."""
    return ''.join(char for char in word if char not in string.punctuation)

In [12]:
def rm_punc_from_text(text):
    """Strip punctuation from ``text``.

    ``text`` is iterated element by element (character by character for a
    string); each element is passed through ``punctuation_removal`` and the
    results are concatenated.
    """
    return ''.join(map(punctuation_removal, text))

In [13]:
def number_removal(text):
    """Delete every run of ASCII digits and collapse whitespace to single spaces."""
    without_digits = re.compile('[0-9]+').sub('', text)
    tokens = without_digits.split()
    return ' '.join(tokens)

In [14]:
# Remove stopwords from text
def stopword_removal(text):
    """Drop English stopwords from a whitespace-separated string.

    The NLTK stopword list is loaded once and cached on the function as a
    frozenset. Previously the list was re-read and scanned linearly for every
    word on every call, which is costly when applied to a whole DataFrame column.

    Args:
        text: Input string; tokens are obtained with ``str.split()``.

    Returns:
        The remaining tokens joined by single spaces.
    """
    _stopwords = getattr(stopword_removal, '_english_stopwords', None)
    if _stopwords is None:
        _stopwords = frozenset(stopwords.words('english'))
        stopword_removal._english_stopwords = _stopwords
    return ' '.join(word for word in text.split() if word not in _stopwords)

In [15]:
# Cleaning text
def clean_text(text):
    """Normalise a string before tokenisation.

    Steps, in order: lowercase; remove punctuation, digits and stopwords via
    the sibling helpers; drop en dashes; collapse whitespace; strip non-ASCII
    characters after NFKD normalisation; apply a series of regex substitutions
    (repeated symbols, assorted special characters, ``mailto:``, ticket-style
    ids, trailing ``.``/``-``/``:``, single characters surrounded by
    whitespace); reduce URLs to their host part; collapse whitespace again.

    Args:
        text: A ``str``. Passing a list raises ``AttributeError``.

    Returns:
        The cleaned, lowercase string.
    """
    text = text.lower()
    text = rm_punc_from_text(text)
    text = number_removal(text)
    text = stopword_removal(text)

    text = re.sub('–', '', text)
    text = ' '.join(text.split())

    def _sub(pattern, replacement, value):
        # Each step re-lowercases its result, so upper-case placeholders such
        # as INC_NUM end up lowercase in the output (as they did before).
        return re.sub(pattern, replacement, str(value)).lower()

    # All patterns below are raw strings: sequences such as "\+", "\." or "\d"
    # in ordinary string literals are invalid escape sequences and trigger
    # warnings on recent Python versions. The patterns themselves are unchanged.
    for pattern in (r"(\t)", r"(\r)", r"(\n)"):
        text = _sub(pattern, ' ', text)

    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode(
        'utf-8', 'ignore'
    )

    for pattern in (r"(__+)", r"(--+)", r"(~~+)", r"(\+\++)", r"(\.\.+)"):
        text = _sub(pattern, ' ', text)

    text = _sub(r"[<>()|&©ø\[\]\'\",;?~*!]", ' ', text)

    text = _sub(r"(mailto:)", ' ', text)
    text = _sub(r"(\\x9\d)", ' ', text)
    text = _sub(r"([iI][nN][cC]\d+)", 'INC_NUM', text)
    text = _sub(r"([cC][mM]\d+)|([cC][hH][gG]\d+)", 'CM_NUM', text)

    for pattern in (r"(\.\s+)", r"(\-\s+)", r"(\:\s+)", r"(\s+.\s+)"):
        text = _sub(pattern, ' ', text)

    # Replace each URL with its own host part. A callable replacement avoids
    # the old search-then-sub inside a blanket try/except, does not treat
    # backslashes in the host as escapes, and no longer substitutes the first
    # URL's host for every URL in the text.
    text = re.sub(
        r'((https*:\/*)([^\/\s]+))(.[^\s]+)',
        lambda url: url.group(3),
        str(text),
    )

    text = _sub(r"(\s+)", ' ', text)
    text = _sub(r"(\s+.\s+)", ' ', text)

    return text

In [16]:
# Run the full clean_text pipeline over both columns.
df.text = df.text.apply(clean_text)
df.headlines = df.headlines.apply(clean_text)

In [17]:
# Wrap every headline in literal _START_ / _END_ markers.
# NOTE(review): the next cell additionally wraps headlines in sostok/eostok, so each
# headline carries two pairs of markers. Presumably Keras' Tokenizer strips "_" and
# lowercases, turning these into ordinary "start"/"end" tokens — confirm this is intended.
df.headlines = df.headlines.apply(lambda x: f'_START_ {x} _END_')

In [18]:
# Sequence boundary tokens; the decoding helpers later look these up in the target
# vocabulary to seed and to stop generation.
start_token = 'sostok'
end_token = 'eostok'
df.headlines = df.headlines.apply(lambda x: f'{start_token} {x} {end_token}')

In [19]:
# Preview five random rows after cleaning and marker insertion.
df.sample(5)

In [20]:
# Word counts per row, used to choose the maximum sequence lengths below.
text_count = [len(sentence.split()) for sentence in df.text]
headlines_count = [len(sentence.split()) for sentence in df.headlines]

# Histograms of both length distributions, restricted to the 0-50 word range.
pd.DataFrame({'text': text_count, 'headlines': headlines_count}).hist(bins=100, figsize=(16, 4), range=[0, 50])
plt.show()

In [21]:
# Maximum number of words kept per article text and per headline (headline count
# includes the marker tokens added above). Used for filtering and for padding.
max_text_len = 42
max_summary_len = 13

In [22]:

def trim_text_and_summary(df, max_text_len, max_summary_len):
    """Keep only the rows whose text and headline are short enough.

    Args:
        df: DataFrame with string columns ``text`` and ``headlines``.
        max_text_len: Maximum number of whitespace-separated words in ``text``.
        max_summary_len: Maximum number of words in ``headlines``.

    Returns:
        A new DataFrame with columns ``text`` and ``summary`` holding the
        surviving pairs in their original order.
    """
    kept_pairs = [
        (body, headline)
        for body, headline in zip(np.array(df['text']), np.array(df['headlines']))
        if len(body.split()) <= max_text_len
        and len(headline.split()) <= max_summary_len
    ]
    return pd.DataFrame({
        'text': [body for body, _ in kept_pairs],
        'summary': [headline for _, headline in kept_pairs],
    })


# Replace df with the length-filtered frame; the headline column is named `summary` from here on.
df = trim_text_and_summary(df, max_text_len, max_summary_len)


In [23]:
def get_word_percent_which_are_very_rare(tokeniser, threshold):
    """Report which share of the vocabulary occurs fewer than ``threshold`` times.

    Args:
        tokeniser: Object exposing ``word_counts``, a mapping word -> number
            of occurrences (a fitted Keras ``Tokenizer`` provides one).
        threshold: Words with a count strictly below this value are "rare".

    Returns:
        ``{'percent': p}`` where ``p`` is the percentage (rounded to two
        decimals) of distinct words that are rare; ``0.0`` for an empty
        vocabulary.
    """
    # The loop variable used to be called `count` as well, which overwrote the
    # rare-word counter on every iteration and made the result meaningless.
    rare_words = 0
    total_words = 0

    for word_count in tokeniser.word_counts.values():
        total_words += 1
        if word_count < threshold:
            rare_words += 1

    if total_words == 0:
        # Avoid dividing by zero when the tokeniser has seen no text.
        return {'percent': 0.0}

    return {
        'percent': round((rare_words / total_words) * 100, 2)
    }

In [24]:
# Hold out 10% of the (text, summary) pairs for validation, with a fixed seed.
x_train, validation_x, y_train, validation_y = train_test_split(
    np.array(df['text']),
    np.array(df['summary']),
    test_size=0.1,
    random_state=1,
    shuffle=True,
)

In [25]:
# Fit a tokenizer on the training texts only, then measure how much of the
# vocabulary is rare (count below 4).
x_tokenizer = Tokenizer()
x_tokenizer.fit_on_texts(list(x_train))

# NOTE(review): x_tokens_data is not read by any other visible cell.
x_tokens_data = get_word_percent_which_are_very_rare(x_tokenizer, 4)

In [26]:
# NOTE(review): this repeats the fit from the previous cell with identical arguments
# and no vocabulary limit, so it appears redundant — confirm whether a
# `num_words` cap based on the rare-word statistics was intended here.
x_tokenizer = Tokenizer()
x_tokenizer.fit_on_texts(list(x_train))

In [27]:
# Integer-encode the texts: each word is replaced by its index in x_tokenizer's
# vocabulary (this is not one-hot encoding).
x_train_sequence = x_tokenizer.texts_to_sequences(x_train)
validation_x_sequence = x_tokenizer.texts_to_sequences(validation_x)

In [28]:
# Padding: bring every source sequence to exactly max_text_len ids, padding at the end.
x_train_padded = pad_sequences(x_train_sequence, maxlen=max_text_len, padding='post')
validation_x_padded = pad_sequences(validation_x_sequence, maxlen=max_text_len, padding='post')


In [29]:

# Source vocabulary size plus one; later used as the encoder embedding's input_dim
# and as the row count of the source embedding matrix.
x_size = len(x_tokenizer.word_index) + 1


In [30]:
# Y tokeniser: a separate vocabulary for the summaries, fitted on training data only.
y_tokenizer = Tokenizer()
y_tokenizer.fit_on_texts(list(y_train))


In [31]:
# Integer-encode the summaries with y_tokenizer's vocabulary (not one-hot encoding).
y_train_sequence = y_tokenizer.texts_to_sequences(y_train)
validation_y_sequence = y_tokenizer.texts_to_sequences(validation_y)

# padding: every summary becomes exactly max_summary_len ids, padded at the end
y_train_padded = pad_sequences(y_train_sequence, maxlen=max_summary_len, padding='post')
validation_y_padded = pad_sequences(validation_y_sequence, maxlen=max_summary_len, padding='post')

# Target vocabulary size plus one; used for the decoder embedding and the softmax width.
y_size = len(y_tokenizer.word_index) + 1

In [32]:
# removing summary which only has sostok & eostok
def remove_indices(sequence, n_marker_tokens=2):
    """Return the row positions whose sequence holds nothing but marker tokens.

    Args:
        sequence: Iterable of padded id sequences (lists or a 2-D array) in
            which 0 is the padding value.
        n_marker_tokens: Number of non-zero ids an "empty" row contains. The
            default of 2 corresponds to a start and an end token only.

    Returns:
        List of integer positions of the rows with exactly
        ``n_marker_tokens`` non-zero entries.
    """
    lst = []
    for idx, row in enumerate(sequence):
        non_padding = sum(1 for val in row if val != 0)
        if non_padding == n_marker_tokens:
            # Previously appended the undefined name `i` instead of the row index.
            lst.append(idx)
    return lst


# Find rows whose summary holds only marker tokens, separately for each split.
remove_train_indexes = remove_indices(y_train_padded)
remove_val_indexes = remove_indices(validation_y_padded)

# Delete those rows from both the summaries and the matching texts so that
# x and y stay aligned row by row.
y_train_padded = np.delete(y_train_padded, remove_train_indexes, axis=0)
x_train_padded = np.delete(x_train_padded, remove_train_indexes, axis=0)

validation_y_padded = np.delete(validation_y_padded, remove_val_indexes, axis=0)
validation_x_padded = np.delete(validation_x_padded, remove_val_indexes, axis=0)

In [33]:
# Model hyperparameters.
dim = 240             # units of every LSTM layer
embedding_dim = 300   # embedding width; the GloVe file loaded below is the 300d one
num_epochs = 100      # upper bound on training epochs

In [34]:
def get_embedding_matrix(word_tokeniser, dimesions, size=None,
                         glove_file='../input/glove6b/glove.6B.300d.txt'):
    """Build an embedding matrix for a tokeniser's vocabulary from a GloVe file.

    Args:
        word_tokeniser: Object exposing ``word_index`` (word -> integer id).
        dimesions: Length of each embedding vector; must match the vector
            length stored in ``glove_file``.
        size: Number of rows of the matrix. When falsy, ``len(vocabulary) + 2``
            is used.
        glove_file: Path of a text file with one ``word v1 v2 ...`` entry per
            line. Defaults to the path that used to be hard-coded.

    Returns:
        ``numpy.ndarray`` of shape ``(rows, dimesions)``. Row ``i`` holds the
        vector of the word with id ``i``; words without a pretrained vector,
        and ids that do not fit into ``size`` rows, keep all-zero rows.
    """
    wrd_indices = word_tokeniser.word_index
    tokens = size if size else len(wrd_indices) + 2

    word_embedding_matrix = np.zeros((tokens, dimesions))

    # Explicit UTF-8: the platform default encoding may fail on non-ASCII tokens.
    with open(glove_file, encoding='utf-8') as file:
        for line in file:
            parts = line.split(maxsplit=1)
            if len(parts) != 2:
                # Blank or malformed line.
                continue
            word_value, coefficient = parts

            # Only vocabulary words are converted and stored, so the full
            # pretrained table is never held in memory.
            idx = wrd_indices.get(word_value)
            if idx is None or idx >= tokens:
                continue

            word_embedding_matrix[idx] = np.asarray(coefficient.split(), dtype='float32')

    return word_embedding_matrix


# Pretrained embedding matrices for the source and target vocabularies,
# with one row per vocabulary id (x_size / y_size rows).
x_embedding_matrix = get_embedding_matrix(x_tokenizer, embedding_dim, x_size)
y_embedding_matrix = get_embedding_matrix(y_tokenizer, embedding_dim, y_size)

In [35]:
# Sanity check: expected shapes are (x_size, embedding_dim) and (y_size, embedding_dim).
print(x_embedding_matrix.shape)
print(y_embedding_matrix.shape)

In [36]:
def build_lstm_model(embedding_dim, dim, max_text_len, xsize, ysize,x, y):
    """Build and compile the encoder-decoder training model inside the TPU strategy scope.

    Args:
        embedding_dim: Output width of both embedding layers.
        dim: Number of units of each LSTM layer.
        max_text_len: Length of the encoder input sequences.
        xsize: input_dim of the encoder embedding.
        ysize: input_dim of the decoder embedding and width of the softmax output.
        x: Initial weights for the encoder embedding (kept frozen).
        y: Initial weights for the decoder embedding (trainable).

    Returns:
        dict with the compiled ``model`` plus the input tensors, output tensors,
        state tensors and decoder layers that the inference builders reuse.

    Relies on the module-level ``tpu_strategy``.
    """
    with tpu_strategy.scope():

        # Encoder
        # Defining input of Encoder
        enc_input = Input(shape=(max_text_len, ))

        # Encoder Embeddings
        input_embeddings = Embedding(input_dim =xsize, output_dim=embedding_dim,
                                     embeddings_initializer=tf.keras.initializers.Constant(x),
                                   trainable=False)(enc_input)

        # Decoder embedding layer; created here and applied to the decoder input further down.
        dec_embedding = Embedding(ysize, embedding_dim, embeddings_initializer=tf.keras.initializers.Constant(y),
                              trainable=True)

        # Encoder 1
        lstm1 = LSTM(dim, return_sequences=True, return_state=True, dropout=0.4, recurrent_dropout=0.4)
        
        # Only the sequence output of the first LSTM is used; its states are discarded.
        out1 = lstm1(input_embeddings)[0]

        # Encoder 2: its states (`states`) become the decoder's initial state.
        lstm2 = LSTM(dim, return_sequences=True, return_state=True, dropout=0.4, recurrent_dropout=0.4)
        end_out, *states = lstm2(out1)

        # Decoder
        dec_input = Input(shape=(None, ))

        # Creating Embeddings for decoder
        decoder_embedding = dec_embedding(dec_input)

        # Decoder 1 - LSTM
        dec_lstm = LSTM(dim,return_sequences=True, return_state=True, dropout=0.4,recurrent_dropout=0.4)

        dec_output, *decoder_last_states = dec_lstm(decoder_embedding, initial_state=states)

        # dense layer: softmax over the ysize target ids at every time step
        decoder_dense = TimeDistributed(Dense(ysize, activation='softmax'))
        dec_output = decoder_dense(dec_output)

        model = Model([enc_input, dec_input], dec_output)
        model.compile(
            optimizer=tf.keras.optimizers.RMSprop(learning_rate=0.001),
            loss='sparse_categorical_crossentropy',
            metrics=['accuracy']
        )

        return {
            'model': model,
            'inputs': {'encoder': enc_input,'decoder': dec_input
            },
            'outputs': {'encoder': end_out,'decoder': dec_output
            },
            'states': {'encoder': states,'decoder': decoder_last_states
            },
            'layers': {'decoder': { 'embedding': dec_embedding,'last_dec_lstm': dec_lstm,'dense': decoder_dense
                }
            }
        }

In [37]:
# Build the training model; `seq2seq` is the dict of model, tensors and layers returned by the builder.
seq2seq = build_lstm_model(embedding_dim, dim, max_text_len, x_size, y_size,x_embedding_matrix, y_embedding_matrix)

In [38]:
# Inspect the decoder layers (embedding, LSTM, dense) that inference will reuse.
seq2seq['layers']['decoder']

In [39]:
# The compiled training model.
model = seq2seq['model']

In [40]:
# Encoder-side tensors needed to assemble the inference encoder.
enc_input = seq2seq['inputs']['encoder']
encoder_output = seq2seq['outputs']['encoder']
encoder_final_states = seq2seq['states']['encoder']

In [41]:
# Decoder-side tensors and layers needed to assemble the inference decoder.
decoder_input = seq2seq['inputs']['decoder']
decoder_output = seq2seq['outputs']['decoder']
decoder_last_states = seq2seq['states']['decoder']
decoder_embedding_layer = seq2seq['layers']['decoder']['embedding']
last_decoder_lstm = seq2seq['layers']['decoder']['last_dec_lstm']
decoder_dense = seq2seq['layers']['decoder']['dense']

In [42]:
# Inspect the input of the second-to-last layer of the training model.
model.layers[-2].input

In [43]:
# Training callbacks, both driven by validation loss.
# NOTE(review): both use patience=2, so training may be stopped at the same point
# where the learning rate would first be reduced — confirm the patience values.
callbacks = [
    EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=2),
    ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=2, min_lr=0.000001, verbose=1),
]

Pass `validation_data` to `model.fit()` as a `tuple` rather than a `list`; see this [post](https://stackoverflow.com/questions/61586981/valueerror-layer-sequential-20-expects-1-inputs-but-it-received-2-input-tensor) for the reason.

In [44]:
# Train the model. The decoder input is each summary without its last position
# (y[:, :-1]); the target is the same summary shifted left by one (y[:, 1:]),
# reshaped with a trailing axis of size 1.
# `final_model` receives the return value of fit(); the plotting cells read its `.history`.
final_model = model.fit(
    [x_train_padded, y_train_padded[:, :-1]],
    y_train_padded.reshape(y_train_padded.shape[0], y_train_padded.shape[1], 1)[:, 1:],
    epochs=num_epochs,
    batch_size=128 * tpu_strategy.num_replicas_in_sync,
    callbacks=callbacks,
    validation_data=(
        [validation_x_padded, validation_y_padded[:, :-1]],
        validation_y_padded.reshape(validation_y_padded.shape[0], validation_y_padded.shape[1], 1)[:, 1:]
    )
)

**Plotting model's performance**

In [45]:
# Accuracy
# Both curves are plotted against explicit epoch numbers. The first training
# epoch is still skipped, but the curve now starts at epoch 2 instead of being
# shifted one epoch to the left of the validation curve. A legend is drawn so
# that the labels are actually shown.
train_accuracy = final_model.history['accuracy']
val_accuracy = final_model.history['val_accuracy']
plt.plot(range(2, len(train_accuracy) + 1), train_accuracy[1:], label='train acc')
plt.plot(range(1, len(val_accuracy) + 1), val_accuracy, label='val')
plt.xlabel('Number of Epochs')
plt.ylabel('Accuracy value')
plt.legend()
plt.show()

In [46]:
# Loss
# Both curves are plotted against explicit epoch numbers. The first training
# epoch is still skipped, but the curve now starts at epoch 2 instead of being
# shifted one epoch to the left of the validation curve. A legend is drawn so
# that the labels are actually shown.
train_loss = final_model.history['loss']
val_loss = final_model.history['val_loss']
plt.plot(range(2, len(train_loss) + 1), train_loss[1:], label='train loss')
plt.plot(range(1, len(val_loss) + 1), val_loss, label='val')
plt.xlabel('Number of Epochs')
plt.ylabel('Loss Value')
plt.legend()
plt.show()

In [47]:
# Lookup tables used during decoding: id -> word for the target and source
# vocabularies, and word -> id for the target vocabulary.
target_idx_word= y_tokenizer.index_word
source_idx_word = x_tokenizer.index_word
target_word_index = y_tokenizer.word_index

In [48]:
def build_seq2seq_model_with_just_lstm_inference(
    max_text_len, latent_dim, encoder_input, encoder_output,
    encoder_final_states, decoder_input, decoder_output,
    decoder_embedding_layer, decoder_dense, last_decoder_lstm
):
    """Assemble the inference-time encoder and decoder models from the trained layers.

    Args:
        max_text_len: Time dimension of the extra encoder-output input.
        latent_dim: Width of the decoder state inputs.
        encoder_input, encoder_output, encoder_final_states: Tensors of the
            training model's encoder.
        decoder_input: Decoder input tensor of the training model.
        decoder_output: Not read; the name is reassigned inside the function.
        decoder_embedding_layer, decoder_dense, last_decoder_lstm: Decoder
            layers of the training model, reused here.

    Returns:
        Tuple ``(encoder_model, decoder_model)``. The encoder maps the input to
        its output sequence followed by its final states. The decoder takes
        ``[decoder_input, encoder outputs, state_h, state_c]`` and returns the
        softmax output followed by the new LSTM states.
    """
    # Encode the input sequence to get the feature vector
    encoder_model = Model(
        inputs=encoder_input, outputs=[encoder_output] + encoder_final_states
    )

    # Decoder setup
    # Below tensors will hold the states of the previous time step
    decoder_state_input_h = Input(shape=(latent_dim, ))
    decoder_state_input_c = Input(shape=(latent_dim, ))
    # Declared as a model input below, but not used to compute any output.
    decoder_hidden_state_input = Input(shape=(max_text_len, latent_dim))

    # Get the embeddings of the decoder sequence
    decoder_embedding = decoder_embedding_layer(decoder_input)

    # To predict the next word in the sequence, set the initial
    # states to the states from the previous time step
    decoder_output, *decoder_states = last_decoder_lstm(
        decoder_embedding,
        initial_state=[decoder_state_input_h, decoder_state_input_c]
    )

    # A dense softmax layer to generate prob dist. over the target vocabulary
    decoder_output = decoder_dense(decoder_output)

    # Final decoder model
    decoder_model = Model(
        [decoder_input] + [decoder_hidden_state_input, decoder_state_input_h, decoder_state_input_c], 
        [decoder_output] + decoder_states
    )

    return (encoder_model, decoder_model)

In [49]:
# Re-declaration of constants set earlier in the notebook, plus `latent_dim`
# (same value as `dim`).
# NOTE(review): num_epochs was 100 when the model was trained; the value 50 set
# here is not read by any later visible cell — confirm it is still needed.
max_text_len = 42
max_summary_len = 13
latent_dim = 240
embedding_dim = 300
num_epochs = 50

In [50]:
def build_hybrid_seq2seq_model_inference(
    max_text_len, latent_dim, encoder_input, encoder_output,
    encoder_final_states, decoder_input, decoder_output,
    decoder_embedding_layer, decoder_dense, last_decoder_bi_lstm
):
    """Inference builder variant whose encoder-output input is twice as wide.

    Mirrors the plain LSTM inference builder, except that the extra
    encoder-output input has width ``latent_dim * 2``. Only the forward h/c
    states are passed as the decoder's initial state; the backward state inputs
    are commented out.

    NOTE(review): no visible cell calls this function — confirm it is still needed.

    Returns:
        Tuple ``(encoder_model, decoder_model)``.
    """

    # Encode the input sequence to get the feature vector
    encoder_model = Model(
        inputs=encoder_input, outputs=[encoder_output] + encoder_final_states
    )

    # Decoder setup
    # Below tensors will hold the states of the previous time step
    decoder_state_forward_input_h = Input(shape=(latent_dim, ))
    decoder_state_forward_input_c = Input(shape=(latent_dim, ))
    # decoder_state_backward_input_h = Input(shape=(latent_dim, ))
    # decoder_state_backward_input_c = Input(shape=(latent_dim, ))

    # Create the hidden input layer with twice the latent dimension,
    # since we are using bi - directional LSTM's we will get 
    # two hidden states and two cell states
    # (declared as a model input below, but not used to compute any output)
    decoder_hidden_state_input = Input(shape=(max_text_len, latent_dim * 2))

    decoder_initial_state = [
        decoder_state_forward_input_h, decoder_state_forward_input_c,
        #decoder_state_backward_input_h, decoder_state_backward_input_c
    ]

    # Get the embeddings of the decoder sequence
    decoder_embedding = decoder_embedding_layer(decoder_input)

    # To predict the next word in the sequence, set the initial
    # states to the states from the previous time step
    decoder_output, *decoder_states = last_decoder_bi_lstm(
        decoder_embedding, initial_state=decoder_initial_state
    )

    # A dense softmax layer to generate prob dist. over the target vocabulary
    decoder_output = decoder_dense(decoder_output)

    # Final decoder model
    decoder_model = Model(
        [decoder_input] + [decoder_hidden_state_input] + decoder_initial_state,
        [decoder_output] + decoder_states
    )

    return (encoder_model, decoder_model)
# Build the inference models with the plain LSTM builder.
# NOTE(review): the following cell makes the same call again (passing `dim`
# instead of `latent_dim`; both are 240), so one of the two calls looks redundant.
encoder_model, decoder_model = build_seq2seq_model_with_just_lstm_inference(
    max_text_len, latent_dim, enc_input, encoder_output,
    encoder_final_states, decoder_input, decoder_output,
    decoder_embedding_layer, decoder_dense, last_decoder_lstm
)

In [51]:
# Build the inference encoder/decoder pair from the trained layers,
# using `dim` as the decoder state width.
encoder_model, decoder_model = build_seq2seq_model_with_just_lstm_inference(
    max_text_len, dim, enc_input, encoder_output,
    encoder_final_states, decoder_input, decoder_output,
    decoder_embedding_layer, decoder_dense, last_decoder_lstm
)

In [52]:
# Print the layer summary of the inference encoder.
encoder_model.summary()

In [53]:
# Print the layer summary of the inference decoder.
decoder_model.summary()

In [54]:
# Inspect the input of the third-to-last layer of the inference decoder.
decoder_model.layers[-3].input

In [55]:
def decode_sequence_seq2seq_model_with_lstm_model(
    input_sequence, encoder_model, decoder_model
):
    """Greedily decode a summary for one encoded input sequence.

    Uses the module-level ``target_word_index``, ``target_idx_word``,
    ``start_token``, ``end_token`` and ``max_summary_len``.

    Args:
        input_sequence: Batch of one padded source sequence, as accepted by
            ``encoder_model.predict``.
        encoder_model: Inference encoder returning ``(outputs, state_h, state_c)``.
        decoder_model: Inference decoder taking
            ``[target_seq, encoder_outputs, state_h, state_c]`` and returning
            ``(token_probabilities, state_h, state_c)``.

    Returns:
        The decoded words, each preceded by a single space (so a non-empty
        result starts with a space). The end token is never included.
    """
    # Encode the input as state vectors.
    e_out, e_h, e_c = encoder_model.predict(input_sequence)

    # Target sequence of length 1, seeded with the start token.
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = target_word_index[start_token]

    decoded_words = []

    while True:
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + [e_out, e_h, e_c]
        )

        # Greedy choice: most probable id at the last time step.
        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))

        # An id without a vocabulary entry (presumably only the padding id 0)
        # used to raise KeyError; it is now treated as the end of the summary.
        sampled_token = target_idx_word.get(sampled_token_index)
        if sampled_token is None or sampled_token == end_token:
            break

        decoded_words.append(sampled_token)

        # Exit condition: maximum summary length reached.
        if len(decoded_words) >= (max_summary_len - 1):
            break

        # Feed the sampled token and the new states into the next step.
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        e_h, e_c = h, c

    return ''.join(' ' + word for word in decoded_words)

In [56]:
def seq2summary(input_sequence):
    """Turn a padded target id sequence back into text.

    Padding (0) and the ids of the start and end tokens are skipped. Every
    remaining word is followed by one space, so a non-empty result ends with
    a trailing space.
    """
    words = (
        target_idx_word[token_id] + ' '
        for token_id in input_sequence
        if not (
            token_id == 0
            or token_id == target_word_index[start_token]
            or token_id == target_word_index[end_token]
        )
    )
    return ''.join(words)

In [57]:
def seq2text(input_sequence):
    """Turn a padded source id sequence back into text.

    Padding (0) is skipped. Every word is followed by one space, so a
    non-empty result ends with a trailing space.
    """
    return ''.join(
        source_idx_word[token_id] + ' '
        for token_id in input_sequence
        if token_id != 0
    )

In [58]:
def predict_text(text, decode_sequence, encoder_model, decoder_model):
    """Summarise raw text with the trained model.

    The text is split into chunks of at most ``max_text_len`` words. Every
    chunk goes through the same preprocessing as the training texts
    (lowercase -> contraction expansion -> ``clean_text``), is encoded with
    ``x_tokenizer`` and decoded with ``decode_sequence``.

    Fixes over the previous version, which raised on every call:
    * ``clean_text`` was called with a list instead of a string;
    * the undefined ``expand_contractions`` is replaced by ``lemmatise_words``;
    * the cleaned text, not the raw words, is now what gets tokenised;
    * target-side marker tokens are no longer added to the source text, as the
      training texts do not contain them;
    * the chunk padding loop, which could only ever add an empty extra chunk,
      is removed.

    Args:
        text: Raw input string.
        decode_sequence: Callable ``(padded_input, encoder_model, decoder_model)``
            returning the decoded string.
        encoder_model: Inference encoder passed through to ``decode_sequence``.
        decoder_model: Inference decoder passed through to ``decode_sequence``.

    Returns:
        For inputs of at most ``max_text_len`` words, the decoder output as is.
        For longer inputs, the trimmed outputs of all chunks joined by spaces.
    """
    def _summarise(words):
        # Same order of steps as the training pipeline.
        prepared = clean_text(lemmatise_words(' '.join(words).lower()))
        seq = x_tokenizer.texts_to_sequences([prepared])
        padded = pad_sequences(seq, maxlen=max_text_len, padding='post')
        return decode_sequence(
            padded.reshape(1, max_text_len), encoder_model, decoder_model
        )

    txt_lst = text.split()

    if len(txt_lst) <= max_text_len:
        return _summarise(txt_lst)

    summary_words = []
    for i in range(0, len(txt_lst), max_text_len):
        _pred = _summarise(txt_lst[i:i + max_text_len])
        # NOTE(review): as before, the first and the last two decoded words of
        # each chunk are dropped, while the short-input branch returns the
        # decoder output untrimmed — confirm the intended trimming.
        summary_words.extend(_pred.split()[1:-2])

    return ' '.join(summary_words)

In [59]:
# Testing on training data: for the first 15 training rows, print the source text,
# the reference summary and the summary produced by greedy decoding.
for i in range(0, 15):
    print(f"# {i+1} News text: ", seq2text(x_train_padded[i]))
    print("Original summary text: ", seq2summary(y_train_padded[i]))
    print(
        "Predicted summary text ",
        decode_sequence_seq2seq_model_with_lstm_model(
            x_train_padded[i].reshape(1, max_text_len), encoder_model,
            decoder_model
        )
    )
    print("*"*30)

In [60]:
# Testing on validation data
# `original` and `predicted` collect the reference and generated summaries for
# the ROUGE evaluation. Each sample is decoded once and the result is reused
# for both printing and collecting (it used to be decoded twice).
original = []
predicted = []
for i in range(0, 15):
    print(f"# {i+1} News Text: ", seq2text(validation_x_padded[i]))
    reference_summary = seq2summary(validation_y_padded[i])
    print("Original summary Text: ", reference_summary)
    predicted_summary = decode_sequence_seq2seq_model_with_lstm_model(
        validation_x_padded[i].reshape(1, max_text_len), encoder_model,
        decoder_model
    )
    print("Predicted summary Text: ", predicted_summary)
    original.append(reference_summary)
    predicted.append(predicted_summary)
    print("*"*30)

In [None]:
# Save the training model and both inference models in HDF5 format
# to the current working directory.
model.save('model.h5')    
encoder_model.save('encoder_model.h5')
decoder_model.save('decoder_model.h5')

In [None]:
# NOTE(review): installed without a version pin, and the next cell also imports
# `datasets`, which no visible cell installs — confirm the environment provides it.
!pip install rouge_score

In [None]:
# NOTE(review): `load_metric` may be unavailable in recent `datasets` releases —
# confirm against the installed version.
from datasets import load_metric
metric = load_metric("rouge")

def calc_rouge_scores(candidates, references):
    """Compute ROUGE scores for generated summaries against reference summaries.

    Args:
        candidates: Generated summaries (passed as ``predictions``).
        references: Reference summaries, in the same order.

    Returns:
        dict mapping each key returned by the metric to its mid F-measure,
        as a percentage rounded to one decimal.
    """
    result = metric.compute(predictions=candidates, references=references, use_stemmer=True)
    result = {key: round(value.mid.fmeasure * 100, 1) for key, value in result.items()}
    return result

In [None]:
# Cumulative ROUGE scores over the first 1, 2, ..., N collected validation samples.
# Previously every iteration scored the full lists, printing the same numbers
# each time, and passed the references where the candidates were expected.
for i in range(len(predicted)):
    print(f"First {i+1} senctences: Scores {calc_rouge_scores(predicted[:i + 1], original[:i + 1])}")