In [1]:
from __future__ import print_function

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from seq2seq import Seq2SeqSummarizer
import numpy as np
from collections import Counter

Using TensorFlow backend.


We do not always need to load previously saved weights.

In [3]:
# Switch for starting from previously saved weights instead of from scratch.
# NOTE(review): no cell visible in this notebook section reads this flag — confirm where it is used.
LOAD_EXISTING_WEIGHTS = False

Setting some hyperparameter values for the encoder and decoder networks as well as the number of training epochs.

In [4]:
# Default cap used by fit_text: input texts are truncated to this many space-separated tokens.
MAX_INPUT_SEQ_LENGTH = 500
# Default cap used by fit_text: summaries (with the START/END markers added) are truncated to this many tokens.
MAX_TARGET_SEQ_LENGTH = 50
# fit_text keeps at most this many of the most frequent input words; PAD and UNK are added on top.
MAX_INPUT_VOCAB_SIZE = 5000
# fit_text keeps at most this many of the most frequent target words; UNK is added on top.
MAX_TARGET_VOCAB_SIZE = 2000
# Number of training epochs.
# NOTE(review): not referenced by any code visible in this notebook section.
EPOCHS = 400

Defining a text preprocessing method for the neural network inputs

The output of this method is the configuration of the neural network

In [5]:
def fit_text(X, Y, input_seq_max_length=None, target_seq_max_length=None,
             input_vocab_max_size=None, target_vocab_max_size=None):
    """Build the vocabulary/config dictionary from raw texts and summaries.

    Each input text is split on single spaces, lower-cased and truncated to
    ``input_seq_max_length`` tokens. Each summary is lower-cased, wrapped in
    ``START`` / ``END`` markers, split on single spaces and truncated to
    ``target_seq_max_length`` tokens (so ``END`` may be cut off for long
    summaries). Word frequencies are counted on the truncated sequences.

    Args:
        X: iterable of input text strings.
        Y: iterable of summary strings.
        input_seq_max_length: token cap for inputs; defaults to
            ``MAX_INPUT_SEQ_LENGTH`` when ``None``.
        target_seq_max_length: token cap for summaries; defaults to
            ``MAX_TARGET_SEQ_LENGTH`` when ``None``.
        input_vocab_max_size: number of most frequent input words to keep;
            defaults to ``MAX_INPUT_VOCAB_SIZE`` when ``None``.
        target_vocab_max_size: number of most frequent target words to keep;
            defaults to ``MAX_TARGET_VOCAB_SIZE`` when ``None``.

    Returns:
        dict with the keys ``input_word2idx``, ``input_idx2word``,
        ``target_word2idx``, ``target_idx2word``, ``num_input_tokens``,
        ``num_target_tokens``, ``max_input_seq_length`` and
        ``max_target_seq_length``. Input indices reserve 0 for ``PAD`` and 1
        for ``UNK``; target indices reserve 0 for ``UNK``.
    """
    if input_seq_max_length is None:
        input_seq_max_length = MAX_INPUT_SEQ_LENGTH
    if target_seq_max_length is None:
        target_seq_max_length = MAX_TARGET_SEQ_LENGTH
    if input_vocab_max_size is None:
        input_vocab_max_size = MAX_INPUT_VOCAB_SIZE
    if target_vocab_max_size is None:
        target_vocab_max_size = MAX_TARGET_VOCAB_SIZE

    input_counter = Counter()
    target_counter = Counter()
    max_input_seq_length = 0
    max_target_seq_length = 0

    for line in X:
        # Splitting on ' ' (not arbitrary whitespace) is deliberate: it keeps
        # the token stream identical to what existing vocabularies were built on.
        text = [word.lower() for word in line.split(' ')][:input_seq_max_length]
        input_counter.update(text)
        max_input_seq_length = max(max_input_seq_length, len(text))

    for line in Y:
        # The markers stay upper-case while the summary is lower-cased, so
        # they cannot collide with ordinary words.
        text = ('START ' + line.lower() + ' END').split(' ')[:target_seq_max_length]
        target_counter.update(text)
        # Updated once per summary rather than once per word.
        max_target_seq_length = max(max_target_seq_length, len(text))

    # Input vocabulary: most frequent words start at index 2, then the
    # reserved PAD/UNK entries are written last (same order as before).
    input_word2idx = dict()
    for idx, (word, _) in enumerate(input_counter.most_common(input_vocab_max_size)):
        input_word2idx[word] = idx + 2
    input_word2idx['PAD'] = 0
    input_word2idx['UNK'] = 1
    input_idx2word = dict((idx, word) for word, idx in input_word2idx.items())

    # Target vocabulary: most frequent words start at index 1, UNK is index 0.
    target_word2idx = dict()
    for idx, (word, _) in enumerate(target_counter.most_common(target_vocab_max_size)):
        target_word2idx[word] = idx + 1
    target_word2idx['UNK'] = 0
    target_idx2word = dict((idx, word) for word, idx in target_word2idx.items())

    return {
        'input_word2idx': input_word2idx,
        'input_idx2word': input_idx2word,
        'target_word2idx': target_word2idx,
        'target_idx2word': target_idx2word,
        'num_input_tokens': len(input_word2idx),
        'num_target_tokens': len(target_word2idx),
        'max_input_seq_length': max_input_seq_length,
        'max_target_seq_length': max_target_seq_length,
    }

Create an instance of the neural network and train the network on the input data

Reusing a saved model (for prediction purposes only)

Using the test data to validate the prediction of the neural network

In [6]:
# Load the articles/summaries, restore the saved summarizer, and write
# generated summaries for a random sample of up to 20 rows to a CSV file.
print('loading csv file ...')
data_dir_path = "./data/"

model_dir_path = './models/'

df = pd.read_csv(data_dir_path + "train.csv")
Y = df['Summary']
X = df['Full Text']

# NOTE(security): allow_pickle=True unpickles arbitrary Python objects; only
# load config files that were produced locally by a trusted training run.
config = np.load(Seq2SeqSummarizer.get_config_file_path(model_dir_path=model_dir_path), allow_pickle=True).item()

summarizer = Seq2SeqSummarizer(config)
summarizer.load_weights(weight_file_path=Seq2SeqSummarizer.get_weight_file_path(model_dir_path=model_dir_path))

print('start predicting ...')

output_columns = ['Index', 'Article', 'Original Summary', 'Generated Summary']

# Collect rows in a list and build the frame once: growing a DataFrame row by
# row is quadratic, and DataFrame.append no longer exists in pandas >= 2.0.
records = []
# NOTE: the sample is unseeded, so a different set of rows is picked on each run.
for i in np.random.permutation(np.arange(len(X)))[0:20]:
    # i is a position in 0..len(X)-1, so index positionally rather than by label.
    x = X.iloc[i]
    actual_summary = Y.iloc[i]
    gen_summary = summarizer.summarize(x)
    records.append({
        'Index': int(i),
        'Article': x,
        'Original Summary': actual_summary,
        'Generated Summary': gen_summary
    })

# Passing columns keeps the expected header even when there are no records.
op_data = pd.DataFrame(records, columns=output_columns)

print("Predictions done")
op_data.to_csv("output_summary.csv")

loading csv file ...
Model summary
Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
encoder_inputs (InputLayer)     (None, None)         0                                            
__________________________________________________________________________________________________
encoder_embedding (Embedding)   (None, 500, 100)     500200      encoder_inputs[0][0]             
__________________________________________________________________________________________________
decoder_inputs (InputLayer)     (None, None, 2001)   0                                            
__________________________________________________________________________________________________
encoder_lstm (LSTM)             [(None, 100), (None, 80400       encoder_embedding[0][0]          
_________________________________________________________

In [7]:
# Preview the first rows of the sampled predictions.
op_data.head()


Unnamed: 0,Index,Article,Original Summary,Generated Summary
0,45,Hyderabad-based drugmaker Dr Reddy's Laborator...,Indian drugmaker Dr Reddy's Laboratories said ...,former australian batsman matthew hayden has m...
1,70,Patidar leader Hardik Patel on Wednesday rubbi...,Patidar leader Hardik Patel on Wednesday asked...,finance minister arun jaitley has alleged that...
2,6,Peeling onions have seldom been a loving task....,IIT Kharagpur and South Korea-based researcher...,iit kharagpur and south korea-based researcher...
3,88,Developing voice assistant and speech recognit...,US-based Cisco's Executive Chairman John Chamb...,us-based cisco's executive chairman has picked...
4,99,At a time when Flipkart's valuation has been m...,Amazon on Wednesday said that it has more than...,reacting to bihar has said that he should not ...


In [8]:
# Sample passage about software refactoring, used as ad-hoc input for the summarizers.
x="The main purpose of refactoring is to fight technical debt. It transforms a mess into clean code and simple design. The user should initiate the program by running the java code through the command prompt or terminal, depending on the operating system. By running the program, a FilerReader class and the BufferedReader class from java.io package is used to take input frvements depend a lot on the skills of the maintainer. Coupling and cohesion on the other hand are quality attributes which are generally recognizedas being among the most likely quantifiable indicators for software maintainability. Therefore, this paper analyze show refactorings manipulate coupling/cohesion character-istics, and how to identify refactoring opportunities that improve these characteristics. As such we provide practicalguidelines for the optimal usage of refactoring in a software maintenance process. Refactorings behavior preserving source code transformations — allow the automated redistribution of pieces of source code over the class hierarchy. The underlying objective is to improve the quality of the software system,with regard to future maintenance and development activities. Unfortunately, while it is clear that we can use refactorings to restructure software systems, it is unclear how to use them in order to improve specific quality attributes thatare indicators for a good design. We start from the assumption that coupling and cohesion characteristics may serve as indicators for the optimal distribution of responsiblities over the class hierarchies. Thus, rather than saying that refactoring will improve the design, we aim for a less ambitious goal of improving the coupling and cohesion. Cohesion then corresponds to the degree to which elements of a class belong together, and coupling is the strength of association established by a connection from one class to another."

In [13]:
# Summarize the sample passage held in `x` with the seq2seq model, instead of
# repeating the whole passage as a second string literal.
summarizer.summarize(x)

'former australian batsman matthew has been appointed as the upcoming state assembly and bharat will to their government on the region on all reports that china is maintaining a sizeable number of troops in and others of its from six called called called if called if this will to be'

In [10]:
from sumy.parsers.plaintext import PlaintextParser
from sumy.summarizers.edmundson import EdmundsonSummarizer
from sumy.nlp.tokenizers import Tokenizer

# Parse the sample passage `x` as plain text using sumy's English tokenizer.
english_tokenizer = Tokenizer('english')
parser = PlaintextParser.from_string(x, english_tokenizer)

In [11]:
# Summarize the parsed document with sumy's EdmundsonSummarizer, configured
# with hand-picked word lists, and print the three sentences it returns.
print("summary by EdmundsonSummarizer")
edsummarizer = EdmundsonSummarizer()

bonus_cue_words = ("refactoring", "java")
# The stigma list and the null list are given the same words.
stigma_and_null_words = ("another", "and", "some", "next")

edsummarizer.bonus_words = bonus_cue_words
edsummarizer.stigma_words = stigma_and_null_words
edsummarizer.null_words = stigma_and_null_words

for sentence in edsummarizer(parser.document, 3):
    print(sentence)

summary by EdmundsonSummarizer
The main purpose of refactoring is to fight technical debt.
The user should initiate the program by running the java code through the command prompt or terminal, depending on the operating system.
As such we provide practicalguidelines for the optimal usage of refactoring in a software maintenance process.
