In [11]:
import time
import pandas as pd
from sqlalchemy import create_engine

import torch
from summarizer import Summarizer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

from src.config import Config

## Load data from database

In [4]:
# load data from database
engine = create_engine("sqlite:///" + Config.FILES["DATABASE_DIR"])
df = pd.read_sql_table("Text_table", engine)

# display loaded dataframe
df.size

10

## Model

In [21]:
model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
tokenizer = AutoTokenizer.from_pretrained("t5-base")

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [12]:
def ext_sum(text, ratio=0.8):
    """
    Generate extractive summary using BERT model
    
    INPUT:
    text - str. Input text
    ratio - float. Enter a ratio between 0.1 - 1.0 [default = 0.8]
            (ratio = summary length / original text length)
    
    OUTPUT:
    summary - str. Generated summary
    """
    bert_model = Summarizer()
    summary = bert_model(text, ratio=ratio)
    
    return summary


def abs_sum(text, model, tokenizer, min_length=80, 
                     max_length=150, length_penalty=15, 
                     num_beams=2):
    """
    Generate abstractive summary using T5 model
    
    INPUT:
    text - str. Input text
    model - model name
    tokenizer - model tokenizer
    min_length - int. The min length of the sequence to be generated
                      [default = 80]
    max_length - int. The max length of the sequence to be generated 
                      [default = 150]
    length_penalty - float. Set to values < 1.0 in order to encourage the model 
                     to generate shorter sequences, to a value > 1.0 in order to 
                     encourage the model to produce longer sequences.
                     [default = 15]
    num_beams - int. Number of beams for beam search. 1 means no beam search
                     [default = 2]
    
    OUTPUT:
    summary - str. Generated summary
    """
    tokens_input = tokenizer.encode("summarize: "+text, return_tensors='pt', 
                                    # model tokens max input length
                                    max_length=tokenizer.model_max_length, 
                                    truncation=True)
    
    summary_ids = model.generate(tokens_input,
                                min_length=min_length,
                                max_length=max_length,
                                length_penalty=length_penalty, 
                                num_beams=num_beams)

    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        
    return summary


def generate_summary(text, model, tokenizer, ext_ratio=1.0, min_length=80, 
                     max_length=150, length_penalty=15, 
                     num_beams=2):
    """
    Generate summary for using extractive & abstractive methods
    
    INPUT:
    text - str. Input text
    model - model name
    tokenizer - model tokenizer
    ext_ratio - float. Enter a ratio between 0.1 - 1.0 [default = 1.0]
                (ratio = summary length / original text length)
                1.0 means no extractive summarization is performed before 
                abstractive summarization
    min_length - int. The min length of the sequence to be generated
                 [default = 80]
    max_length - int. The max length of the sequence to be generated 
                 [default = 150]
    length_penalty - float. Set to values < 1.0 in order to encourage the model 
                     to generate shorter sequences, to a value > 1.0 in order to 
                     encourage the model to produce longer sequences.
                     [default = 15]
    num_beams - int. Number of beams for beam search. 1 means no beam search
                     [default = 2]
    
    OUTPUT:
    summary - str. Generated summary
    """
    if ext_ratio == 1.0:
        summary = abs_sum(text, model, tokenizer, min_length, 
                       max_length, length_penalty, num_beams)
    elif ext_ratio < 1.0:
        text = ext_sum(text, ratio = ext_ratio)
        summary = abs_sum(text, model, tokenizer, min_length, 
                       max_length, length_penalty, num_beams)
    else:
        print('Error! Please enter ext_ratio betwen 0.1 and 1.0')
        
    return summary

In [8]:
def gen_sum_save_monitor(df, model, tokenizer, output_folder, ext_ratio=1.0, 
                         min_length=80, max_length=150, length_penalty=15, 
                         num_beams=2):
    """
    Monitor progress while generating summary & save output to list & text file
    
    INPUT:
    df - DataFrama. Data loaded from database
    model - model name
    tokenizer - model tokenizer
    output_folder - str. Folder name to save the generated output in text file
    ext_ratio - float. Enter a ratio between 0.1 - 1.0 [default = 1.0]
                (ratio = summary length / original text length)
                1.0 means no extractive summarization is performed before 
                abstractive summarization
    min_length - int. The min length of the sequence to be generated
                 [default = 80]
    max_length - int. The max length of the sequence to be generated
                 [default = 150]
    length_penalty - float. Set to values < 1.0 in order to encourage the model 
                     to generate shorter sequences, to a value > 1.0 in order to 
                     encourage the model to produce longer sequences.
                     [default = 15]
    num_beams - int. Number of beams for beam search. 1 means no beam search 
                [default = 2]
    
    OUTPUT:
    summaries - list. Generated summary appended to a list
    """
    summaries = []
    for i in range(len(df)):
        file_path = df.file_path[i]
        raw_text = df.raw_text[i]
    
        start = time.time()
        summary = generate_summary(raw_text, model, tokenizer, 
                                   ext_ratio, min_length, max_length, 
                                   length_penalty, num_beams)
        
        file_name = file_path[4:][:-4]+'_summary.txt'
        
        with open(output_folder + "/" + file_name, 'w')as text_file:
            text_file.write(summary)
        
        
        summaries.append(summary)
        end = time.time()
        print(" Summarized '{}'[time: {:.2f}s]".format(file_path, 
                                                       end-start))
        
    return summaries

In [13]:
summaries = gen_sum_save_monitor(df, model, tokenizer, output_folder=Config.FILES["PREPROCESS_DIR"], ext_ratio=1)

 Summarized 'circle-of-life-hospice.pdf'[time: 8.59s]
 Summarized 'Concord Regional VNA Systems Success Story.pdf'[time: 8.14s]
 Summarized 'first-choice-home-health-and-hospice.pdf'[time: 7.99s]
 Summarized 'Maple Knoll Communities success story.pdf'[time: 8.57s]
 Summarized 'willow-health.pdf'[time: 8.35s]


In [14]:
df["summary"] = summaries

In [9]:
# view the generated summary
for i in range(len(df)):
    print("File path: {}".format(df.file_path[i]))
    print("")
    print('Summary:')
    print("---------")
    print(df.summary[i])
    print("\n")

File path: pdf/Maple Knoll Communities success story.pdf

Summary:
---------
Maple Knoll Communities has been delivering care for over 172 years. the organization is by no means encumbered by processes of the past. Maple Knoll believes in the power of innovation and that technology can, and does, make life easier for residents, staff and families. with Netsmart Telehealth, Maple Knoll can improve clinical satisfaction. with virtual visits and improve physician. & resident satisfaction. the organization has been delivering care for over 172 years, the organization is by no means 


File path: pdf/circle-of-life-hospice.pdf

Summary:
---------
Circle of Life Hospice is the largest non-profit hospice in northwest Arkansas. Circle of Life is committed to compassionate end-of-life care for a person’s body, mind, spirit and family when there is no longer a cure. the hospital readmission rate for patients admitted to Circle of Life in 2017 was 0.5 percent. the organization’s core values of co

In [16]:
engine = create_engine("sqlite:///" + Config.FILES["DATABASE_DIR"])
df.to_sql("Text_table", engine, if_exists="replace", index=False)

5