In [1]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
from heapq import nlargest
from datasets import load_dataset
from rouge import Rouge
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments

In [3]:
# Load the "1.0.0" configuration of the CNN/DailyMail dataset. Later cells
# read `ds['train'][i]['article']` and `ds['train'][i]['highlights']`.
ds = load_dataset("cnn_dailymail", "1.0.0")
# Bare expression so the notebook displays the splits and their row counts.
ds

DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 287113
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 11490
    })
})

In [4]:
# Materialise spaCy's English stop words as a list; `select_main_sentence`
# reads this module-level name when filtering tokens.
stopwords = list(STOP_WORDS)
# Small English spaCy pipeline, passed to `select_main_sentence` as `nlp`
# (used there for tokenisation and sentence splitting).
nlp = spacy.load('en_core_web_sm')

In [5]:
def select_main_sentence(text, punctuation, nlp, stop_words=None, summary_length=3):
    """Extract the highest-scoring sentences of ``text`` as a summary.

    Each non-stop-word, non-punctuation token is counted case-insensitively;
    a sentence's score is the sum of the counts of the counted words it
    contains. The ``summary_length`` best sentences are returned.

    Args:
        text: The document to summarise.
        punctuation: Iterable of punctuation characters (e.g.
            ``string.punctuation``). A newline is always treated as
            punctuation as well.
        nlp: Callable turning ``text`` into a document that is iterable over
            tokens (each with a ``.text`` attribute) and exposes ``.sents``,
            an iterable of sentences that are themselves iterable over tokens.
        stop_words: Optional iterable of lowercase words to ignore. When
            omitted, the notebook-level ``stopwords`` list is used.
        summary_length: Maximum number of sentences to return (default 3).

    Returns:
        A list of at most ``summary_length`` sentence objects taken from
        ``doc.sents``, ordered from highest to lowest score. Sentences that
        contain no counted word are never returned, so the list may be
        shorter (or empty).
    """
    if stop_words is None:
        # Backward-compatible default: the list built in the notebook.
        stop_words = stopwords
    stop_words = set(stop_words)
    punct_chars = set(punctuation) | {'\n'}

    doc = nlp(text)
    sentences = list(doc.sents)

    def _is_counted(lowered):
        # Skip stop words, whitespace-only tokens, and tokens made up solely
        # of punctuation characters (covers multi-character ones like "--").
        if lowered in stop_words or not lowered.strip():
            return False
        return not all(ch in punct_chars for ch in lowered)

    # Key frequencies by the lowercased form so the lookup below, which also
    # lowercases, finds capitalised words too.
    word_frequencies = {}
    for word in doc:
        lowered = word.text.lower()
        if _is_counted(lowered):
            word_frequencies[lowered] = word_frequencies.get(lowered, 0) + 1

    # Score by sentence index so sentence objects need not be hashable.
    sentence_scores = {}
    for index, sent in enumerate(sentences):
        for word in sent:
            weight = word_frequencies.get(word.text.lower())
            if weight is not None:
                sentence_scores[index] = sentence_scores.get(index, 0) + weight

    best = nlargest(summary_length, sentence_scores, key=sentence_scores.get)
    return [sentences[index] for index in best]
    
    


In [6]:
from transformers import pipeline

In [12]:
# The tokenizer comes from the base T5-small checkpoint.
# NOTE(review): the documented tokenizer length argument is `model_max_length`;
# confirm whether `max_length=1024` here has the intended effect.
tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-small", max_length=1024)
# `from_pretrained` is a classmethod that RETURNS a new model. Calling it on an
# existing instance and discarding the result leaves that instance on the base
# weights, so load the fine-tuned checkpoint directly into `model`.
model = AutoModelForSeq2SeqLM.from_pretrained("./summary/checkpoint-6656")
print('load trained model and weights')
# device=0 selects device ordinal 0 (the first GPU).
pipe = pipeline('text2text-generation', model=model, tokenizer=tokenizer, device=0)


load trained model and weights


In [77]:
# Compare the extractive (frequency-based) summary and the T5 summary of one
# training article against its reference highlights using ROUGE-L.
rouge = Rouge()
sample = ds['train'][200]  # index the dataset once and reuse the record

print("\n----------------------------article---------------------------------------\n")
text = sample['article']
print(text)
print("\n----------------------------label---------------------------------------\n")
target = sample['highlights']
print(target)
print("\n----------------------------generate summary---------------------------------------")
summary = select_main_sentence(text, punctuation, nlp)
# Join with a space so the last word of one sentence is not fused with the
# first word of the next, which would distort the scores.
generate_summary = " ".join(str(each) for each in summary)
print(generate_summary)
# Rouge.get_scores takes (hypothesis, reference); the reversed order swaps the
# reported precision and recall.
print("\n Rouge-L: ", rouge.get_scores(generate_summary, target)[0]['rouge-l'])

print("\n----------------------------generate summary t5 model---------------------------------------")
pip_res = pipe("Generate summary:\n" + text, max_length = 64)
t5_summary = pip_res[0]['generated_text']
print(t5_summary)
print("\n Rouge-L between label and generate summary with t5 model is ", rouge.get_scores(t5_summary, target)[0]['rouge-l'])



----------------------------article---------------------------------------

LONDON, England -- A mother is seeking to have the womb of her severely disabled daughter removed to prevent the 15-year-old from feeling the pain and discomfort of menstruation. Doctors in Britain are now taking legal advice to see if they are permitted to carry out the hysterectomy on Katie Thorpe, who suffers from cerebral palsy. But a charity campaigning for the disabled said on Monday the move could infringe human rights and would set a "disturbing precedent." Andy Rickell, executive director of disability charity Scope, told the Press Association: "It is very difficult to see how this kind of invasive surgery, which is not medically necessary and which will be very painful and traumatic, can be in Katie's best interests. "This case raises fundamental ethical issues about the way our society treats disabled people and the respect we have for disabled people's human and reproductive rights.  Watch why the 