# Extractive Text Summarization

Models used: BERTSUM and TextRank

Libraries and packages required for this notebook

In [6]:
# BERTSUM
from summarizer import Summarizer
# Text Rank
from summa import summarizer

# Visualization
from ipywidgets import interact, interactive, fixed, interact_manual, HBox, Layout, VBox, Text
import ipywidgets as widgets

# Extract full text from URL
from newspaper import fulltext
import requests

# Validate URL
import validators

# ROUGE score
from rouge_metric import PyRouge

# Used to create a pickle file
import pickle

BERTSUM

In [2]:
def _bert_model():
    """Return the shared BERT Summarizer, constructing it on first use."""
    # Constructing a Summarizer is expensive (it loads the pretrained model),
    # so one instance is kept on the function object and reused by every call
    # instead of being rebuilt for each summary.
    if not hasattr(_bert_model, "instance"):
        _bert_model.instance = Summarizer()
    return _bert_model.instance


def bert_sum(input_val, max_ratio):
    """
    Summarize text with the BERT extractive summarizer and print the result.

    See https://pypi.org/project/bert-extractive-summarizer/.

    IN: input_val (string) - text to summarize
        max_ratio (float)  - passed to the model as `ratio`;
                             0 means "do not pass a ratio" (model default)
    OUT: summary (string)
    """
    model = _bert_model()
    if max_ratio == 0:
        result = model(input_val)
    else:
        result = model(input_val, ratio=max_ratio)
    summary = "".join(result)
    print('BERT')
    print(summary)
    print()
    return summary

TextRank

In [3]:
def text_rank(input_val, max_ratio):
    '''
    Summarize text with TextRank (summa) and print the result.

    See https://pypi.org/project/summa/.

    IN: input_val (string) - text to summarize
        max_ratio (float)  - forwarded as `ratio`; 0 means no ratio is
                             passed, leaving summa's default in effect
    OUT: summary (string)
    '''
    # Only forward a ratio when the caller supplied a non-zero one.
    ratio_kwargs = {} if max_ratio == 0 else {'ratio': max_ratio}
    ranked_summary = summarizer.summarize(input_val, **ratio_kwargs)
    for line in ('Text Rank', ranked_summary):
        print(line)
    print()
    return ranked_summary

In [4]:
def summarize(input_val, max_ratio):
    '''
    Run both summarizers on the same input; each one prints its own summary.

    IN: input_val (string) - text to summarize
        max_ratio (float)  - ratio forwarded unchanged to both summarizers
    OUT: -
    '''
    separator = '------------------------------------------------------------------------------------------'
    # The summaries are printed by the summarizers themselves, so the
    # returned strings are not needed here.
    bert_sum(input_val, max_ratio)
    print(separator)
    text_rank(input_val, max_ratio)

In [5]:
# pickle stores functions by reference (module and name), not by value, so
# this file can only be loaded where `summarize` is defined again.
# The context manager closes the file even if dumping fails.
with open('model/summarizers.pickle', 'wb') as pickle_file:
    pickle.dump(summarize, pickle_file)

In [6]:
# pickle stores functions by reference (module and name), not by value, so
# this file can only be loaded where `bert_sum` is defined again.
# The context manager closes the file even if dumping fails.
with open('model/bert_sum.pickle', 'wb') as pickle_file:
    pickle.dump(bert_sum, pickle_file)

 Recall-Oriented Understudy for Gisting Evaluation (ROUGE) score 

In [18]:
def rouge_score(hyp, ref):
    '''
    Computes and prints the ROUGE scores, see https://pypi.org/project/rouge-metric/.

    IN: hyp (string) - hypothesis, i.e. the summary generated by the model
        ref (string) - reference ("golden") summary
    OUT: the mapping returned by PyRouge.evaluate; each of its items is
         also printed between separator lines
    '''
    separator = '------------------------------------------------------------------------------------------'

    # PyRouge.evaluate is called with one hypothesis and, for it, a list
    # holding the single reference.
    hypotheses = [hyp]
    references = [[ref]]

    # Evaluate document-wise ROUGE scores
    rouge = PyRouge(rouge_n=(1, 2, 4),
                    rouge_l=True,
                    rouge_w=True,
                    rouge_w_weight=1.2,
                    rouge_s=True,
                    rouge_su=True,
                    skip_gap=4)

    scores = rouge.evaluate(hypotheses, references)
    for pair in scores.items():
        print(separator)
        print(pair)
    print(separator)
    # Return the scores as well, so callers can use them programmatically
    # (the original only printed them despite documenting an output).
    return scores

General functions 

In [5]:
def full_text_url(input_url, timeout=10):
    """
    Convert article from url to a full text, see https://newspaper.readthedocs.io/en/latest/.

    IN: input_url (string) - address of the article
        timeout (number)   - seconds to wait for the server before giving up
    OUT: full article (string)
    RAISES: requests.exceptions.RequestException when the request times out,
            the connection fails, or the server answers with an HTTP error status
    """
    # Without a timeout, requests can wait indefinitely on an unresponsive server.
    response = requests.get(input_url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of extracting "article text" from an error page.
    response.raise_for_status()
    return fulltext(response.text)

def valid_url(input_url):
    """
    Report whether the given string passes validators.url().

    IN: URL (string)
    OUT: boolean - True only when validators.url() compares equal to True
    """
    # The explicit comparison turns whatever validators.url() returns on
    # failure into a plain False.
    return bool(validators.url(input_url) == True)
    
def max_length(max_value):
    '''
    Converts the content of the "Max length" field into a summary ratio.

    IN: max value entered in the field (string), a percentage between 0 and 100;
        surrounding whitespace and a trailing "%" are tolerated
    OUT: max ratio (float) in (0, 1], or 0.0 when the field is empty, still
         holds its placeholder text, is not a number, or is out of range.
         Callers treat 0.0 as "no ratio given".
    '''
    placeholder = '0-100(%) of original text...'

    cleaned = '' if max_value is None else str(max_value).strip()
    # Nothing entered, or the untouched placeholder: no ratio requested.
    if not cleaned or cleaned == placeholder:
        return 0.0

    try:
        percent = float(cleaned.rstrip('%').strip())
    except ValueError:
        # Free text that is not a number is treated like an empty field.
        return 0.0

    # Rejects negatives, zero, values above 100, and NaN (all comparisons
    # with NaN are false).
    if not 0 < percent <= 100:
        return 0.0
    return percent / 100


Visualization using ipywidgets

In [49]:
# Define and display the image for the visualization.
# The context manager closes the image file once its bytes have been read.
with open(r"summary.PNG", "rb") as image_file:
    repoimage = widgets.Image(
        value=image_file.read(),
        format='PNG',
        width=900,
        height=50,
    )

display(repoimage)

# Create the button, the output area and the input fields
button = widgets.Button(description="Summarize", button_style='success', layout={'left': '550px'})
output = widgets.Output(layout={'border': '1px solid black'})
text = widgets.Textarea(value='Type here...', placeholder='Type something', description='Text:',
                        disabled=False, layout={'width': '700px', 'height': '200px'})
text_url = widgets.Textarea(value='Type here...', placeholder='Type something', description='Text:',
                        disabled=False, layout={'width': '700px', 'height': '40px'})
# Max length field: its initial value is the hint text that max_length()
# compares against to detect an untouched field.
max_length_ratio = widgets.Textarea(value='0-100(%) of original text...', placeholder='Type something', description='Max length:',
                        disabled=False, layout={'width': '300px', 'height': '40px'})

file_upload = widgets.FileUpload(accept='', multiple=False)


def select(input):
    '''
    Shows the input widget that matches the selected input type, followed by
    the max-length field, the button and the output area.

    IN: selected option ('Text', 'URL', anything else is handled as a document)
    OUT: -
    '''
    # Option -> (prompt to print, widget used to enter the input)
    known_inputs = {
        'Text': ("Input Text: ", text),
        'URL': ("Input URL: ", text_url),
    }
    prompt, entry_widget = known_inputs.get(input, ("Input Doc: ", file_upload))
    print(prompt)
    display(entry_widget, max_length_ratio, button, output)
        

# Create the input-type selection buttons; select() is called with the
# chosen option as its `input` argument.
interact(
    select,
    input=widgets.RadioButtons(
        description='Input Type:',
        options=['Text', 'URL', 'Document'],
        value='Text',
        disabled=False,
    ),
)

@output.capture(clear_output=True)
def on_button_clicked(b):
    '''
    Summarizes the current input when the button is clicked.

    The input source is picked by content, in this order: an uploaded
    document, then a valid URL in the URL field, otherwise the text field.
    The `output.capture` decorator clears the output widget and routes
    everything printed here into it, so no extra `with output:` is needed.

    NOTE(review): the selected "Input Type" radio button is not consulted, so
    once a file has been uploaded it takes precedence over URL/text input -
    confirm this is intended.

    IN: button clicked
    OUT: -
    '''
    if len(file_upload.value) > 0:
        print('Downloaded document')
        # Round-trip the upload through output.txt; the context managers
        # make sure both file handles are closed.
        with open("output.txt", "wb") as uploaded_file:
            uploaded_file.write(file_upload.data[0])
        with open('output.txt', 'r') as uploaded_file:
            source_text = uploaded_file.read()
    elif valid_url(text_url.value):
        print('URL')
        source_text = full_text_url(text_url.value)
    else:
        print('Text')
        source_text = text.value

    # Same for every input source: read the requested ratio, then summarize.
    max_ratio = max_length(max_length_ratio.value)
    summarize(source_text, max_ratio)

# Register the handler: clicking the button calls on_button_clicked.
button.on_click(on_button_clicked)

Image(value=b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x04#\x00\x00\x00z\x08\x06\x00\x00\x00\x85\x81\x99\xe…

interactive(children=(RadioButtons(description='Input Type:', options=('Text', 'URL', 'Document'), value='Text…