## **Import Libraries**

In [None]:
import pandas as pd
import numpy as np  
import pandas as pd 
import re           
from bs4 import BeautifulSoup 
from keras.preprocessing.text import Tokenizer 
from keras.preprocessing.sequence import pad_sequences
from nltk.corpus import stopwords   
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Concatenate, TimeDistributed, Bidirectional
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
import warnings
from sklearn.model_selection import train_test_split
from keras import backend as K
import matplotlib.pyplot as plt
import tensorflow as tf

import nltk
nltk.download("stopwords")

from google.colab import drive
drive.mount('/content/drive')

pd.set_option("display.max_colwidth", 200)
warnings.filterwarnings("ignore")

import pdb

import rouge
from nltk.translate.bleu_score import sentence_bleu
from nltk.tokenize import sent_tokenize, word_tokenize

!pip install py-rouge
nltk.download('punkt')



## **Import Wikihow Dataset**

In [None]:
# Location of the WikiHow CSV on the mounted Google Drive.
path = "/content/drive/My Drive/Text Summarization/wikihowAll.csv"

# Load the raw table and look at a sample plus summary statistics.
data = pd.read_csv(path)
print(data.head(10))
print(data.describe())

# Drop rows with missing values, then show the statistics again for comparison.
data = data.dropna()
print(data.describe())



In [None]:
# Lookup table from English contractions to their expanded forms.
# PreProcess.map_different_words replaces every space-delimited token that is
# a key of this dict with the corresponding value.
# NOTE(review): both cleaning routines lowercase the text before the mapping is
# applied, so the capitalised keys ("I'd", "I'll", ...) can never match there.
contraction_mapping = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not",

                           "didn't": "did not", "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not",

                           "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",

                           "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would",

                           "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would",

                           "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam",

                           "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have",

                           "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock",

                           "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have",

                           "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is",

                           "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as",

                           "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would",

                           "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have",

                           "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have",

                           "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are",

                           "we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",

                           "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is",

                           "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have",

                           "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have",

                           "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all",

                           "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have",

                           "you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have",

                           "you're": "you are", "you've": "you have"}

## **Text Preprocessing**


In [None]:
%%time

# English stop-word list from NLTK; read by PreProcess.remove_stopwords and
# PreProcess.word_frequency to discard common words.
stop_words = nltk.corpus.stopwords.words('english')

class PreProcess():
  """Clean a text/title CSV and turn it into padded token sequences.

  The pipeline (see ``process``) reads the CSV, cleans the ``text`` and
  ``title`` columns, computes word-count ratios, filters rows by
  ``threshold``, splits into train/validation, fits one Keras ``Tokenizer``
  per side and pads the resulting sequences. ``baseline`` additionally
  derives a word-frequency extractive summary for every row.

  The methods rely on the module-level names ``stop_words`` and
  ``contraction_mapping``.
  """

  def __init__(self, dataset_path):
    """Store the dataset location and initialise all pipeline state.

    Args:
      dataset_path: path passed to ``pd.read_csv`` by ``import_dataset``.
    """
    self.dataframe = None
    self.dataset_path = dataset_path
    self.cleaned_dataframe = pd.DataFrame()
    self.x_train = None
    self.y_train = None
    self.x_validation = None
    self.y_validation = None
    self.x_train_token = None
    self.y_train_token = None
    self.x_validation_token = None
    self.y_validation_token = None
    self.x_train_feature = None
    self.y_train_label = None
    self.x_validation_feature = None
    self.y_validation_label = None

    # Tokens shorter than this are dropped by the cleaning routines.
    self.minimum_word_length = 3
    self.max_len_text = 0
    self.max_len_summary = 0

    # Rows whose summary/text word ratio is >= threshold are filtered out.
    self.threshold = 0.8
    self.words_description_dataframe = None
    self.baseline_model_dataframe = pd.DataFrame()

    # Fraction of rows held out for validation by split_dataset().
    self.split_percentage = 0.1
    self.x_tokenizer = None
    self.y_tokenizer = None

    # Filled in by convert_tokens_to_sequences().
    self.x_vocabulary_size = None
    self.y_vocabulary_size = None

    # Filled in by create_word_index_from_tokens().
    self.reverse_target_word_index = None
    self.reverse_source_word_index = None
    self.target_word_index = None

    # Filled in by create_word_freq().
    self.predicted_summaries = None
    self.actual_summaries = None

    print("PreProcessing Initialized")

  def import_dataset(self):
    """Read the CSV at ``dataset_path`` and drop rows with missing values."""
    print("Importing Dataset Started !!!!")
    self.dataframe = pd.read_csv(self.dataset_path)
    self.dataframe.dropna(inplace=True)
    print("Importing Dataset Finished !!!!")

  def print_original_data(self):
    """Print the first three rows of the raw dataframe."""
    print(self.dataframe.head(3))

  def print_processed_data(self):
    """Print the first three rows of the cleaned dataframe."""
    print(self.cleaned_dataframe.head(3))

  def return_cleaned_dataframe(self):
    """Return the cleaned dataframe."""
    return self.cleaned_dataframe

  def remove_stopwords(self, text):
    """Return the whitespace-separated tokens of ``text`` that are not stop words."""
    # stop_words is a list; a set makes every membership test O(1).
    blocked = set(stop_words)
    return [w for w in text.split() if w not in blocked]

  def parse_html_text(self, text):
    """Return the text content of ``text`` as parsed by BeautifulSoup (lxml)."""
    return BeautifulSoup(text, "lxml").text

  def remove_special_charecters(self, text):
    """Remove every parenthesised span ``(...)`` from ``text``."""
    return re.sub(r'\([^)]*\)', '', text)

  def remove_double_quotes(self, text):
    """Remove all double-quote characters from ``text``."""
    return re.sub('"','', text)

  def map_different_words(self, text):
    """Expand contractions: replace each space-delimited token found in
    ``contraction_mapping`` with its mapped value."""
    return ' '.join(contraction_mapping.get(t, t) for t in text.split(" "))

  def filter_only_text(self, text):
    """Strip possessive ``'s`` and replace every non-letter with a space."""
    text = re.sub(r"'s\b","",text)
    text = re.sub("[^a-zA-Z]", " ", text)
    return text

  def give_text_with_punct(self, text):
    """Replace everything except ``a-z``, ``0-9`` and ``.`` with a space."""
    return re.sub( '[^a-z0-9.]', ' ', text)

  def _strip_markup(self, text):
    """Lowercase ``text``, extract its HTML text, and drop parenthesised spans and double quotes."""
    lowered = text.lower()
    parsed = self.parse_html_text(lowered)
    without_parens = self.remove_special_charecters(parsed)
    return self.remove_double_quotes(without_parens)

  def _join_long_tokens(self, tokens):
    """Join the tokens that have at least ``minimum_word_length`` characters."""
    kept = [t for t in tokens if len(t) >= self.minimum_word_length]
    return " ".join(kept).strip()

  def clean_text(self, text):
    """Return ``text`` reduced to lowercase letters-only words.

    Contractions are expanded, stop words removed and tokens shorter than
    ``minimum_word_length`` dropped.
    """
    mapped_text = self.map_different_words(self._strip_markup(text))
    filtered_text = self.filter_only_text(mapped_text)
    return self._join_long_tokens(self.remove_stopwords(filtered_text))

  def clean_text_withp(self, text):
    """Like ``clean_text`` but keeps digits and full stops.

    NOTE(review): punctuation (including apostrophes) is replaced by spaces
    before the contraction mapping runs, so the mapping looks unable to
    match anything here. The order is kept as-is because changing it would
    change the vocabulary - confirm whether the mapping should come first.
    """
    filtered_with_punct_text = self.give_text_with_punct(self._strip_markup(text))
    mapped_text = self.map_different_words(filtered_with_punct_text)
    return self._join_long_tokens(self.remove_stopwords(mapped_text))

  def create_cleaned_dataframe(self):
    """Build ``cleaned_dataframe`` from the raw ``text`` and ``title`` columns.

    Rows whose cleaned title is empty are dropped.
    """
    print("Dataframe text cleaning started!!!")
    cleaned_text = [self.clean_text_withp(t) for t in self.dataframe.text]
    cleaned_title = [self.clean_text_withp(t) for t in self.dataframe.title]

    # Build a fresh frame so the method can be re-run on the same object.
    frame = pd.DataFrame({"cleaned_text": cleaned_text,
                          "cleaned_summary": cleaned_title})
    # Assign the result back: an in-place replace on a selected column does
    # not reliably write through to the frame (no-op under Copy-on-Write).
    frame["cleaned_summary"] = frame["cleaned_summary"].replace('', np.nan)
    self.cleaned_dataframe = frame.dropna(axis=0)
    print("Dataframe text cleaning finished!!!")

  def calculate_word_length_ratio(self):
    """Add summary/text and text/summary word-count ratios, mapping +/-inf to NaN."""
    counts = self.words_description_dataframe
    counts["ratio_summary/text"] = counts["summary"] / counts["text"]
    counts["ratio_text/summary"] = counts["text"] / counts["summary"]
    self.words_description_dataframe = counts.replace([np.inf, -np.inf], np.nan)

  def update_maximum_length_of_text_and_summary(self):
    """Count words per row and record the longest text and summary lengths."""
    print("Finding maximum length of text and summary started!!!")
    text_word_count = [len(t.split()) for t in self.cleaned_dataframe['cleaned_text']]
    summary_word_count = [len(s.split()) for s in self.cleaned_dataframe['cleaned_summary']]

    # This frame gets a fresh 0..n-1 index, row-for-row with cleaned_dataframe.
    self.words_description_dataframe = pd.DataFrame({'text':text_word_count, 'summary':summary_word_count})

    self.max_len_text = int(self.words_description_dataframe["text"].max())
    self.max_len_summary = int(self.words_description_dataframe["summary"].max())
    print("Finding maximum length of text and summary finished!!!")

  def split_dataset(self):
    """Shuffle and split the cleaned rows into train and validation parts."""
    print("Splitting of dataset into train and text started!!!")
    self.x_train,self.x_validation,self.y_train,self.y_validation = train_test_split(
        self.cleaned_dataframe['cleaned_text'],self.cleaned_dataframe['cleaned_summary'],test_size=self.split_percentage,
                                          random_state=0,shuffle=True)
    print("Splitting of dataset into train and text finished!!!")

  def tokenize(self):
    """Fit one tokenizer per side on the training split and encode both splits.

    NOTE(review): summaries are tokenised as they are; no start/end marker
    words are added here - confirm that downstream decoding does not rely
    on such markers being present.
    """
    print("tokenization of text data started!!!")
    self.x_tokenizer = Tokenizer()
    self.x_tokenizer.fit_on_texts(list(self.x_train))

    self.x_train_token = self.x_tokenizer.texts_to_sequences(self.x_train)
    self.x_validation_token = self.x_tokenizer.texts_to_sequences(self.x_validation)

    self.y_tokenizer = Tokenizer()
    self.y_tokenizer.fit_on_texts(list(self.y_train))

    self.y_train_token = self.y_tokenizer.texts_to_sequences(self.y_train)
    self.y_validation_token = self.y_tokenizer.texts_to_sequences(self.y_validation)

    print("tokenization of text data finished!!!")

  def convert_tokens_to_sequences(self):
    """Post-pad the token lists to the recorded maximum lengths and record
    vocabulary sizes (word count + 1 for the padding index)."""
    print("converting tokens into sequences started!!!")

    self.x_train_feature = pad_sequences(self.x_train_token, maxlen=self.max_len_text, padding='post')
    self.x_validation_feature = pad_sequences(self.x_validation_token, maxlen=self.max_len_text, padding='post')

    self.x_vocabulary_size   =  len(self.x_tokenizer.word_index) +1

    self.y_train_label = pad_sequences(self.y_train_token, maxlen=self.max_len_summary, padding='post')
    self.y_validation_label = pad_sequences(self.y_validation_token, maxlen=self.max_len_summary, padding='post')

    self.y_vocabulary_size  =   len(self.y_tokenizer.word_index) +1

    print("converting tokens into sequences finished!!!")

  def create_word_index_from_tokens(self):
    """Expose the tokenizers' index<->word lookups; index 0 maps to "UNKNOWN"."""
    self.reverse_target_word_index=self.y_tokenizer.index_word
    self.reverse_source_word_index=self.x_tokenizer.index_word
    self.target_word_index=self.y_tokenizer.word_index
    self.reverse_target_word_index[0] = "UNKNOWN"

  def assign_ratios_to_cleaned_dataframe(self):
    """Copy the word-count ratios onto ``cleaned_dataframe`` row by row."""
    # Assign by position: cleaned_dataframe keeps gaps in its index after
    # dropna(), whereas words_description_dataframe is numbered 0..n-1, so a
    # label-aligned assignment would shift values and leave NaNs.
    for column in ("ratio_text/summary", "ratio_summary/text"):
      self.cleaned_dataframe[column] = self.words_description_dataframe[column].to_numpy()

  def filter_dataframe_with_threshold(self):
    """Keep only rows whose summary/text ratio is below ``threshold``."""
    print("filtering dataframe with threshold has started!!!")
    self.cleaned_dataframe = self.cleaned_dataframe[self.cleaned_dataframe["ratio_summary/text"] < self.threshold]
    print("filtering dataframe with threshold has finished!!!")

  def _best_short_sentence(self, text, max_sentence_words):
    """Return the highest-scoring sentence of ``text`` with fewer than
    ``max_sentence_words`` space-separated words, or "" if none scores."""
    blocked = set(stop_words)
    word_frequencies = {}
    for word in nltk.word_tokenize(text):
      if word not in blocked:
        word_frequencies[word] = word_frequencies.get(word, 0) + 1

    # Normalise counts by the most frequent word.
    maximum_frequency = max(word_frequencies.values(), default=1)
    for word in word_frequencies:
      word_frequencies[word] = word_frequencies[word] / maximum_frequency

    sentence_scores = {}
    for sent in nltk.sent_tokenize(text):
      if len(sent.split(' ')) >= max_sentence_words:
        continue
      for word in nltk.word_tokenize(sent.lower()):
        if word in word_frequencies:
          sentence_scores[sent] = sentence_scores.get(sent, 0) + word_frequencies[word]

    if not sentence_scores:
      return ""
    # Stable sort, last element: ties go to the sentence seen last.
    return sorted(sentence_scores.items(), key=lambda item: item[1])[-1][0]

  def word_frequency(self, dataset, max_sentence_words=5):
    """Pick one extractive summary sentence per row by word-frequency score.

    Args:
      dataset: dataframe whose first column holds the text and whose second
        column holds the reference summary.
      max_sentence_words: only sentences with fewer space-separated words
        than this are candidates.

    Returns:
      ``(predicted, actual)``: two lists with one entry per row of
      ``dataset``. A row whose scoring fails contributes "" as prediction
      (the error is printed) so both lists stay aligned with ``dataset``.
    """
    pred_summary_sentence_list = []
    actual_summary_sentence_list = []
    for _, row in dataset.iterrows():
      # Explicit positional access; integer keys on a labelled row are
      # deprecated in pandas.
      sentence = row.iloc[0]
      summary = row.iloc[1]
      try:
        sentence_summary = self._best_short_sentence(sentence, max_sentence_words)
      except Exception as e:
        # Best effort: report the problem and keep going.
        print(e)
        sentence_summary = ""
      pred_summary_sentence_list.append(sentence_summary)
      actual_summary_sentence_list.append(summary)
    return pred_summary_sentence_list, actual_summary_sentence_list

  def create_word_freq(self):
    """Run ``word_frequency`` over the cleaned dataframe and store the results."""
    print("Create word frequency has started :-)")
    self.predicted_summaries, self.actual_summaries = self.word_frequency(self.cleaned_dataframe)
    print("Create word frequency has finished :-)")

  def assign_text_to_baseline_df(self):
    """Collect reference summaries, baseline predictions and texts in one frame."""
    print("Create baseline dataframe has started :-)")
    # Positional values: cleaned_dataframe's index has gaps after filtering.
    self.baseline_model_dataframe = pd.DataFrame({
        "actual_summaries": self.actual_summaries,
        "predicted_summaries": self.predicted_summaries,
        "text": self.cleaned_dataframe["cleaned_text"].to_numpy(),
    })
    print("Create baseline dataframe has finished :-)")

  def _run_pipeline(self):
    """Run every step shared by ``process`` and ``baseline``, in order."""
    self.import_dataset()
    self.create_cleaned_dataframe()
    self.update_maximum_length_of_text_and_summary()
    self.calculate_word_length_ratio()
    self.assign_ratios_to_cleaned_dataframe()
    self.filter_dataframe_with_threshold()
    self.split_dataset()
    self.tokenize()
    self.convert_tokens_to_sequences()
    # Build the lookups here too so a freshly processed object is usable
    # without a separate call; calling it again later is harmless.
    self.create_word_index_from_tokens()

  def process(self):
    """Run the full preprocessing pipeline."""
    print("Preprocessing Started :-)")
    self._run_pipeline()
    print("Preprocessing Finished :-)")

  def baseline(self):
    """Run the preprocessing pipeline, then build the word-frequency baseline."""
    print("Baseline Started :-)")
    self._run_pipeline()
    self.create_word_freq()
    self.assign_text_to_baseline_df()
    print("Baseline Finished :-)")

CPU times: user 1.49 ms, sys: 2 µs, total: 1.49 ms
Wall time: 1.5 ms


In [None]:
#@title
#%%time
# Run the whole cleaning / tokenising / padding pipeline on the CSV at `path`.
preprocess_object = PreProcess(path)
preprocess_object.process()
#preprocess_object.print_original_data()
#preprocess_object.print_processed_data()
# Build the index<->word lookups that the model's decoding helpers read.
preprocess_object.create_word_index_from_tokens()

PreProcessing Initialized
Preprocessing Started :-)
Importing Dataset Started !!!!
Importing Dataset Finished !!!!
Dataframe text cleaning started!!!
Dataframe text cleaning finished!!!
Finding maximum length of text and summary started!!!
Finding maximum length of text and summary finished!!!
filtering dataframe with threshold has started!!!
filtering dataframe with threshold has finished!!!
Splitting of dataset into train and text started!!!
Splitting of dataset into train and text finished!!!
tokenization of text data started!!!
tokenization of text data finished!!!
converting tokens into sequences started!!!
converting tokens into sequences finished!!!
Preprocessing Finished :-)


In [None]:
preprocess_object.x_train_feature.shape

(185587, 6529)

In [None]:
preprocess_object.y_train_label.shape

(185587, 15)

In [None]:
preprocess_object.dataframe.head()

Unnamed: 0,headline,title,text
0,"\nKeep related supplies in the same area.,\nMake an effort to clean a dedicated workspace after every session.,\nPlace loose supplies in large, clearly visible containers.,\nUse clotheslines and c...",How to Be an Organized Artist1,"If you're a photographer, keep all the necessary lens, cords, and batteries in the same quadrant of your home or studio. Paints should be kept with brushes, cleaner, and canvas, print supplies sh..."
1,"\nCreate a sketch in the NeoPopRealist manner of the future mural on a small piece of paper 8""x10"" using the black ink pen.,\nPrepare to create your NeoPopRealist mural.,\nPrepare your paint.,\nBe...",How to Create a Neopoprealist Art Work,"See the image for how this drawing develops step-by-step. However, there is an important detail: the following drawings are to examine it, and then, to create something unique.\n\n\nUse the lines..."
2,"\nGet a bachelor’s degree.,\nEnroll in a studio-based program.,\nTrain on a number of VFX computer programs.,\nWatch online tutorials.,\nNurture your artistic side.,\nPay close attention to movies...",How to Be a Visual Effects Artist1,"It is possible to become a VFX artist without a college degree, but the path is often easier with one. VFX artists usually major in fine arts, computer graphics, or animation. Choose a college wi..."
3,"\nStart with some experience or interest in art.,\nUnderstand the difference between art collectors, art investors and art speculators.,\nFigure out what you are willing to pay for art, before goi...",How to Become an Art Investor,"The best art investors do their research on the pieces of art that they buy, so someone with some education or interest in the art world is more likely to understand this niche market. As well as..."
4,"\nKeep your reference materials, sketches, articles, photos, etc, in one easy to find place.,\nMake ""studies,"" or practice sketches, to organize effectively for larger projects.,\nLimit the suppli...",How to Be an Organized Artist2,"As you start planning for a project or work, you'll likely be gathering scraps of inspiration and test sketches. While everyone has a strategy, there is nothing more maddening than digging throug..."


In [None]:
preprocess_object.cleaned_dataframe.describe()

Unnamed: 0,ratio_text/summary,ratio_summary/text
count,206208.0,206208.0
mean,77.565887,0.052816
std,101.953947,0.094736
min,1.285714,0.000321
25%,21.666667,0.010929
50%,47.333333,0.021127
75%,91.5,0.046154
max,3119.0,0.777778


## **Baseline Model**

In [None]:
# Re-run preprocessing and build the word-frequency baseline summaries.
preprocess_object = PreProcess(path)
preprocess_object.process()

# word_frequency needs the dataframe to score and returns the pair in the
# order (predicted, actual).
predicted_summaries, actual_summaries = preprocess_object.word_frequency(
    preprocess_object.cleaned_dataframe)

# Number of rows for which a baseline summary was produced.
dataset_length = len(actual_summaries)

baseline_model_dataframe = pd.DataFrame()
baseline_model_dataframe["actual_summaries"] = actual_summaries
baseline_model_dataframe["predicted_summaries"] = predicted_summaries
# Take plain values so the texts line up by position; cleaned_dataframe's
# index has gaps after row filtering.
baseline_model_dataframe["text"] = (
    preprocess_object.cleaned_dataframe.iloc[0:dataset_length]["cleaned_text"].to_numpy())

## **Baseline Model Evaluation**


In [None]:
import rouge

class Evaluation:
  """Score predicted summaries against reference summaries.

  Only the "rouge" metric (via the ``rouge`` package) is implemented.
  """

  def __init__(self, name, actual_summaries, predicted_summaries):
    """
    Args:
      name: metric to compute; "rouge" is the only supported value.
      actual_summaries: reference summaries.
      predicted_summaries: generated summaries, in the same order.
    """
    self.name = name
    self.actual_summaries = actual_summaries
    self.predicted_summaries = predicted_summaries
    self.synonym_summaries = None

  def rouge_score(self, aggregator="Best"):
    """Compute, print and return ROUGE-N (up to 4), ROUGE-L and ROUGE-W.

    Args:
      aggregator: "Best" or "Avg"; selects how per-sample scores are
        combined by the ``rouge`` evaluator.

    Returns:
      Whatever ``rouge.Rouge.get_scores`` returns for the stored summaries.
    """
    apply_avg = aggregator == 'Avg'
    apply_best = aggregator == 'Best'
    evaluator = rouge.Rouge(metrics=['rouge-n', 'rouge-l', 'rouge-w'],
                           max_n=4,
                           limit_length=True,
                           length_limit=100,
                           length_limit_type='words',
                           apply_avg=apply_avg,
                           apply_best=apply_best,
                           alpha=0.5, # Default F1_score
                           weight_factor=1.2,
                           stemming=True)
    # Score the summaries this object was built with, not notebook globals.
    scores = evaluator.get_scores(self.predicted_summaries, self.actual_summaries)
    print(scores)
    return scores

  def evaluate(self):
    """Run the metric selected by ``name`` and return its scores.

    Raises:
      ValueError: if ``name`` is not a supported metric.
    """
    if str(self.name).lower() == "rouge":
      return self.rouge_score()
    raise ValueError("Unsupported evaluation metric: {!r}".format(self.name))

In [None]:
# Score the baseline: the second argument holds the reference summaries and
# the third the predicted ones (see Evaluation.__init__).
evaluation_object = Evaluation("rouge", actual_summaries, predicted_summaries)

evaluation_object.evaluate()

## **LSTM Model**

In [None]:
class TextSummarizatioModel():
  """LSTM encoder-decoder summariser built on a preprocessing object.

  The preprocessing object supplies ``max_len_text``, ``max_len_summary``,
  the vocabulary sizes and the word-index lookups. When none is passed to
  the constructor, the module-level ``preprocess_object`` is used.
  """

  def __init__(self, preprocessor=None):
    """
    Args:
      preprocessor: optional preprocessing object; defaults to the
        module-level ``preprocess_object`` looked up at call time.
    """
    self.model = None
    self.preprocessor = preprocessor
    # Size of the embedding vectors and of every LSTM state.
    self.encoded_feature_dimension = 350
    # Checkpoint target during training.
    self.model_weight_path = "/content/drive/My Drive/Leuphana Projects/Text Summarization/ModelWeights/final_model_version_3_weights_epoch_28.hd5"
    # Full-model save target used by save_model().
    self.saved_model_path = "/content/drive/My Drive/Leuphana Projects/Text Summarization/ModelWeights/small_refactored_code_model_temp.hd5"
    # Weights restored by load_model().
    self.load_model_weights_path = "/content/drive/My Drive/Leuphana Projects/Text Summarization/ModelWeights/final_model_version_3_weights_epoch_28.hd5"

    self.tpu_model_path = "/content/drive/My Drive/Leuphana Projects/Text Summarization/ModelWeights/tpu_mode.hd5"
    self.epochs = 20
    self.batch_size = 256
    self.history = None

  def _prep(self):
    """Return the preprocessing object this model reads its settings from."""
    if self.preprocessor is not None:
      return self.preprocessor
    return preprocess_object

  def print_model_summary(self):
    """Print the Keras model summary between separator lines."""
    self.print_asterix()
    self.model.summary()
    self.print_asterix()

  def print_asterix(self):
    """Print a separator line of asterisks."""
    print("***" * 30)

  def create_model(self):
    """Build and compile the training model.

    Two stacked encoder LSTMs feed their final states into a single decoder
    LSTM followed by a time-distributed softmax over the target vocabulary.
    """
    self.print_asterix()
    print("Creating Model Started!!!!")
    K.clear_session()
    prep = self._prep()

    # Encoder: embedding followed by two stacked LSTMs.
    self.encoder_inputs = Input(shape=(prep.max_len_text,))
    self.enc_emb = Embedding(prep.x_vocabulary_size,
                             self.encoded_feature_dimension,trainable=True)(self.encoder_inputs)

    self.encoder_lstm1 = LSTM(self.encoded_feature_dimension,return_sequences=True,return_state=True)
    self.encoder_output1, self.state_h1, self.state_c1 = self.encoder_lstm1(self.enc_emb)

    self.encoder_lstm2=LSTM(self.encoded_feature_dimension, return_state=True, return_sequences=True)
    self.encoder_outputs, self.state_h, self.state_c= self.encoder_lstm2(self.encoder_output1)

    # Decoder: embedding + LSTM initialised with the encoder's final states.
    self.decoder_inputs = Input(shape=(None,))
    self.dec_emb_layer = Embedding(prep.y_vocabulary_size, self.encoded_feature_dimension,trainable=True)
    self.dec_emb = self.dec_emb_layer(self.decoder_inputs)

    self.decoder_lstm = LSTM(self.encoded_feature_dimension, return_sequences=True, return_state=True)
    self.decoder_outputs,self.decoder_fwd_state, self.decoder_back_state = self.decoder_lstm(
        self.dec_emb,initial_state=[self.state_h, self.state_c])

    self.decoder_dense = TimeDistributed(Dense(prep.y_vocabulary_size, activation='softmax'))
    self.decoder_outputs = self.decoder_dense(self.decoder_outputs)

    self.model = Model([self.encoder_inputs, self.decoder_inputs], self.decoder_outputs)
    self.model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
    print("Creating Model Finished!!!!")
    self.print_asterix()

  @staticmethod
  def _teacher_forcing_pair(labels):
    """Split padded label sequences into decoder inputs (all but the last
    step) and targets (all but the first step, with a trailing unit axis)."""
    decoder_inputs = labels[:, :-1]
    targets = labels.reshape(labels.shape[0], labels.shape[1], 1)[:, 1:]
    return decoder_inputs, targets

  def train(self, x_train_feature, x_validation_feature, y_train_feature, y_validation_feature):
    """Fit the model and keep the Keras history in ``self.history``.

    Args:
      x_train_feature: padded source sequences for training.
      x_validation_feature: padded source sequences for validation, or None.
      y_train_feature: padded target sequences for training.
      y_validation_feature: padded target sequences for validation, or None.
    """
    self.print_asterix()
    print("Model training started!!!!")
    #compile the model
    self.model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

    # Stop early / checkpoint on the training loss.
    # NOTE(review): both callbacks watch 'loss', not 'val_loss' - confirm
    # that this is the intended criterion now that validation data is fed.
    self.es = EarlyStopping(monitor='loss', mode='min', verbose=1)
    self.checkpoint = ModelCheckpoint(self.model_weight_path, save_best_only=True, monitor='loss', mode='min')

    train_decoder_inputs, train_targets = self._teacher_forcing_pair(y_train_feature)

    # Feed the validation split so that 'val_loss' is recorded; it is what
    # print_train_history() plots as the second curve.
    validation_data = None
    if x_validation_feature is not None and y_validation_feature is not None:
      val_decoder_inputs, val_targets = self._teacher_forcing_pair(y_validation_feature)
      validation_data = ([x_validation_feature, val_decoder_inputs], val_targets)

    self.history=self.model.fit([x_train_feature, train_decoder_inputs],
                                train_targets,
                                epochs=self.epochs,
                                callbacks=[self.es, self.checkpoint],
                                batch_size=self.batch_size,
                                validation_data=validation_data)

    print("Model training Finished!!!!")
    self.print_asterix()

  def save_model(self):
    """Save the full model to ``saved_model_path``."""
    self.print_asterix()
    print("Saving Model!!!!!!!!!!")
    self.model.save(self.saved_model_path)
    print("Model saved successfully!!!!!!!!!!")
    self.print_asterix()

  def load_model(self):
    """Load weights from ``load_model_weights_path`` into the built model."""
    self.print_asterix()
    print("Loading Model started!!!!!!!!!!")
    self.model.load_weights(self.load_model_weights_path)
    print("Loading Model finished!!!!!!!!!!")
    self.print_asterix()

  def print_train_history(self):
    """Plot the training loss and, when recorded, the validation loss."""
    self.print_asterix()
    if self.history is None:
      # train() has not been run on this object, so there is nothing to plot.
      print("No training history available")
      self.print_asterix()
      return
    plt.title("Model Trained History")
    plt.plot(self.history.history['loss'], label='train')
    if 'val_loss' in self.history.history:
      plt.plot(self.history.history['val_loss'], label='test')
    plt.legend()
    plt.show()
    self.print_asterix()

  def evaluation_model(self):
    """Build the stand-alone encoder and single-step decoder used for inference."""
    self.print_asterix()
    print("Creating Evaluation Model Started!!!")
    prep = self._prep()
    self.encoder_model = Model(inputs= self.encoder_inputs,outputs=[self.encoder_outputs, self.state_h, self.state_c])

    decoder_state_input_h = Input(shape=(self.encoded_feature_dimension,))
    decoder_state_input_c = Input(shape=(self.encoded_feature_dimension,))
    # Accepted as a decoder input but not connected to any layer below.
    decoder_hidden_state_input = Input(shape=(prep.max_len_text,self.encoded_feature_dimension))
    dec_emb2= self.dec_emb_layer(self.decoder_inputs)

    decoder_outputs2, state_h2, state_c2 = self.decoder_lstm(dec_emb2,
                                                        initial_state=[decoder_state_input_h, decoder_state_input_c])
    decoder_outputs2 = self.decoder_dense(decoder_outputs2)

    self.decoder_model = Model([self.decoder_inputs] + [decoder_hidden_state_input,decoder_state_input_h,
                                                        decoder_state_input_c],[decoder_outputs2] + [state_h2, state_c2])

    print("Creating Evaluation Model Finished!!!")
    self.print_asterix()

  def decode_sequence(self, input_seq):
    """Greedily decode a summary for one encoded input sequence.

    Starting from the 'start' word, the most probable next word is taken at
    every step. Decoding stops when index 0 or the word 'end' is produced,
    or once ``max_len_summary - 1`` words have been emitted.

    NOTE(review): the visible preprocessing does not append 'start'/'end'
    marker words to the summaries; confirm they exist in the target
    vocabulary with the intended meaning.

    Args:
      input_seq: source sequence accepted by the encoder model.

    Returns:
      The decoded words, each preceded by a single space.

    Raises:
      KeyError: if 'start' is not in the target vocabulary.
    """
    prep = self._prep()
    e_out, e_h, e_c = self.encoder_model.predict(input_seq, verbose=0)

    # The decoder is primed with the 'start' word.
    target_seq = np.zeros((1,1))
    target_seq[0, 0] = prep.target_word_index['start']

    max_words = prep.max_len_summary - 1
    decoded_words = []

    while True:
      output_tokens, h, c = self.decoder_model.predict([target_seq] + [e_out, e_h, e_c], verbose=0)
      sampled_token_index = np.argmax(output_tokens[0, -1, :])

      # Index 0 is padding: nothing more to emit.
      if sampled_token_index == 0:
        break

      sampled_token = prep.reverse_target_word_index[sampled_token_index]
      # Every iteration either stops or appends a word, so the loop is
      # bounded by max_words.
      if sampled_token == 'end':
        break

      decoded_words.append(sampled_token)
      if len(decoded_words) >= max_words:
        break

      # Feed the chosen word and the updated states into the next step.
      target_seq = np.zeros((1,1))
      target_seq[0, 0] = sampled_token_index
      e_h, e_c = h, c

    return ''.join(' ' + word for word in decoded_words)

  def seq2summary(self, input_seq):
    """Convert target indices to words, skipping padding and the
    'start'/'end' words; every word is followed by a space."""
    prep = self._prep()
    skipped = {0, prep.target_word_index.get('start'), prep.target_word_index.get('end')}
    return ''.join(prep.reverse_target_word_index[i] + ' '
                   for i in input_seq if i not in skipped)

  def seq2text(self, input_seq):
    """Convert source indices to words, skipping padding; every word is
    followed by a space."""
    prep = self._prep()
    return ''.join(prep.reverse_source_word_index[i] + ' '
                   for i in input_seq if i != 0)

In [None]:
np.zeros((1,1))

array([[0.]])

## **LSTM Model Training**

In [None]:
#%%time
# Build the network and restore the stored weights before training.
text_summary_model = TextSummarizatioModel()
text_summary_model.create_model()
text_summary_model.load_model()
text_summary_model.print_model_summary()

# Train on the padded sequences, then persist the model.
text_summary_model.train(
    x_train_feature=preprocess_object.x_train_feature,
    x_validation_feature=preprocess_object.x_validation_feature,
    y_train_feature=preprocess_object.y_train_label,
    y_validation_feature=preprocess_object.y_validation_label,
)
text_summary_model.save_model()

# Build the inference models and show the loss curves.
text_summary_model.evaluation_model()
text_summary_model.print_train_history()

******************************************************************************************
Creating Model Started!!!!



Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Creating Model Finished!!!!
******************************************************************************************
******************************************************************************************
Loading Model started!!!!!!!!!!
Loading Model finished!!!!!!!!!!
******************************************************************************************
******************************************************************************************
Creating Evaluation Model Started!!!
Creating Evaluation Model Finished!!!
******************************************************************************************


In [None]:
text_summary_model.evaluation_model()

******************************************************************************************
Creating Evaluation Model Started!!!
Creating Evaluation Model Finished!!!
******************************************************************************************


In [None]:
preprocess_object.x_validation_feature.shape[0]

20621

## **LSTM Model Predictions**

In [None]:
length = 3500

In [None]:
%%time

import csv

# Decode validation rows 1925 .. length-1 and write one CSV row per sample:
# (row number, reference summary, predicted summary).
with open('/content/drive/My Drive/Leuphana Projects/Text Summarization/ModelWeights/predicted_summaries.csv', 'w', newline='') as file:
    writer = csv.writer(file)

    predicted_summaries = []
    actual_summaries = []

    for i in range(1925, length):
        # Progress marker every 100 rows.
        if i % 100 == 0:
            print(i)

        # Reference summary rebuilt from the padded label indices.
        acutual_summary = text_summary_model.seq2summary(
            preprocess_object.y_validation_label[i])
        actual_summaries.append(acutual_summary)

        # The decoder expects a batch of one source sequence.
        predicted_summary = text_summary_model.decode_sequence(
            preprocess_object.x_validation_feature[i].reshape(
                1, preprocess_object.max_len_text))
        predicted_summaries.append(predicted_summary)

        writer.writerow([i, acutual_summary, predicted_summary])
      #print("\n")

2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
CPU times: user 23h 17min 30s, sys: 6h 3min 59s, total: 1d 5h 21min 29s
Wall time: 3h 6min 10s


In [None]:
predicted_summary = text_summary_model.decode_sequence(preprocess_object.x_validation_feature[1925].reshape(1,preprocess_object.max_len_text))

In [None]:
# NOTE(review): the prediction loop above starts at row 1925 and already
# appends that row's prediction; if this cell runs after the loop,
# predicted_summaries ends up one element longer than actual_summaries -
# confirm this append is still wanted before building prediction_dataframe.
predicted_summaries.append(predicted_summary)

In [None]:
prediction_dataframe = pd.DataFrame()

In [None]:
prediction_dataframe["actual_summaries"] = actual_summaries
prediction_dataframe["predicted_summaries"] = predicted_summaries

## **LSTM Model Evaluation**

In [None]:
# Stored predictions: one row per sample.
predictions_path = "/content/drive/My Drive/Leuphana Projects/Text Summarization/predictions.csv"
result_columns = ["index","actual_summaries", "predicted_summaries"]

results = pd.read_csv(predictions_path)
# NOTE(review): read_csv treats the first line as a header before the columns
# are renamed - confirm the file really starts with a header row.
results.columns = result_columns

print(results.describe())

import rouge

class Evaluation:
  """Score predicted summaries against reference summaries.

  Only the "rouge" metric (via the ``rouge`` package) is implemented.
  """

  def __init__(self, name, actual_summaries, predicted_summaries):
    """
    Args:
      name: metric to compute; "rouge" is the only supported value.
      actual_summaries: reference summaries.
      predicted_summaries: generated summaries, in the same order.
    """
    self.name = name
    self.actual_summaries = actual_summaries
    self.predicted_summaries = predicted_summaries
    self.synonym_summaries = None

  def rouge_score(self, aggregator="Best"):
    """Compute, print and return ROUGE-N (up to 4), ROUGE-L and ROUGE-W.

    Args:
      aggregator: "Best" or "Avg"; selects how per-sample scores are
        combined by the ``rouge`` evaluator.

    Returns:
      Whatever ``rouge.Rouge.get_scores`` returns for the stored summaries.
    """
    apply_avg = aggregator == 'Avg'
    apply_best = aggregator == 'Best'
    evaluator = rouge.Rouge(metrics=['rouge-n', 'rouge-l', 'rouge-w'],
                           max_n=4,
                           limit_length=True,
                           length_limit=100,
                           length_limit_type='words',
                           apply_avg=apply_avg,
                           apply_best=apply_best,
                           alpha=0.5, # Default F1_score
                           weight_factor=1.2,
                           stemming=True)
    # Score the summaries this object was built with, not notebook globals.
    scores = evaluator.get_scores(self.predicted_summaries, self.actual_summaries)
    print(scores)
    return scores

  def evaluate(self):
    """Run the metric selected by ``name`` and return its scores.

    Raises:
      ValueError: if ``name`` is not a supported metric.
    """
    if str(self.name).lower() == "rouge":
      return self.rouge_score()
    raise ValueError("Unsupported evaluation metric: {!r}".format(self.name))

In [None]:
# Scores the in-memory actual_summaries / predicted_summaries lists.
# NOTE(review): the `results` frame loaded from predictions.csv above is not
# used here - confirm which of the two sources should be evaluated.
evaluation_object = Evaluation("rouge", actual_summaries, predicted_summaries)

In [None]:
evaluation_object.evaluate()