### Step1: Import packages

In [6]:
import torch
import os
from datasets import Dataset, load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments, AutoModelForPreTraining, pipeline


### Step2: Read dataset

In [7]:
ds = load_dataset("cnn_dailymail", "1.0.0")

In [8]:
ds

DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 287113
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 11490
    })
})

In [9]:
ds['test'][0]

{'article': '(CNN)The Palestinian Authority officially became the 123rd member of the International Criminal Court on Wednesday, a step that gives the court jurisdiction over alleged crimes in Palestinian territories. The formal accession was marked with a ceremony at The Hague, in the Netherlands, where the court is based. The Palestinians signed the ICC\'s founding Rome Statute in January, when they also accepted its jurisdiction over alleged crimes committed "in the occupied Palestinian territory, including East Jerusalem, since June 13, 2014." Later that month, the ICC opened a preliminary examination into the situation in Palestinian territories, paving the way for possible war crimes investigations against Israelis. As members of the court, Palestinians may be subject to counter-charges as well. Israel and the United States, neither of which is an ICC member, opposed the Palestinians\' efforts to join the body. But Palestinian Foreign Minister Riad al-Malki, speaking at Wednesday

In [10]:
len(ds['train'])

287113

In [11]:
ds['train'][0]

{'article': 'LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won\'t cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don\'t plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an Australian interviewer earlier this month. "I don\'t think I\'ll be particularly extravagant. "The things I like buying are things that cost about 10 pounds -- books and CDs and DVDs." At 18, Radcliffe will be able to gamble in a casino, buy a drink in a pub or see the horror film "Hostel: Part II," currently six places below his number one movie on the UK box office char

In [12]:
ds['train'][0]

{'article': 'LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won\'t cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don\'t plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an Australian interviewer earlier this month. "I don\'t think I\'ll be particularly extravagant. "The things I like buying are things that cost about 10 pounds -- books and CDs and DVDs." At 18, Radcliffe will be able to gamble in a casino, buy a drink in a pub or see the horror film "Hostel: Part II," currently six places below his number one movie on the UK box office char

### Step3: Analyze data

In [13]:
torch.cuda.is_available()

True

In [14]:
tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-small", max_length=1024)

In [15]:
def process_func(examples):
    contents = ['Generate summary: \n' + e for e in examples['article']]
    inputs = tokenizer(contents, max_length=1024, truncation=True)
    labels = tokenizer(text_target=examples['highlights'], max_length=64, truncation=True)
    inputs['labels'] = labels['input_ids']
    return inputs


In [16]:
tokenized_ds = ds.map(process_func, batched=True)

In [17]:
tokenizer.decode(tokenized_ds['train'][0]['input_ids'])

'Generate summary: LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won\'t cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don\'t plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an Australian interviewer earlier this month. "I don\'t think I\'ll be particularly extravagant. "The things I like buying are things that cost about 10 pounds -- books and CDs and DVDs." At 18, Radcliffe will be able to gamble in a casino, buy a drink in a pub or see the horror film "Hostel: Part II," currently six places below his number one movie on the UK box offic

In [18]:
tokenizer.decode(tokenized_ds['train'][0]['labels'])

"Harry Potter star Daniel Radcliffe gets £20M fortune as he turns 18 Monday. Young actor says he has no plans to fritter his cash away. Radcliffe's earnings from first five Potter films have been held in trust fund.</s>"

In [19]:
ds['train'][0]['highlights']

"Harry Potter star Daniel Radcliffe gets £20M fortune as he turns 18 Monday . Young actor says he has no plans to fritter his cash away . Radcliffe's earnings from first five Potter films have been held in trust fund ."

### Step4: Create model

In [20]:
model = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-small", max_length=1024)

### Step5: Create evaluate function

In [21]:
import numpy as np
from rouge import Rouge

rouge = Rouge()


In [26]:
def compute_metric(evalPred):
    preds, labels = evalPred.predictions, evalPred.label_ids
    decode_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decode_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    decode_preds = [" ".join(p) for p in decode_preds]
    decode_labels = [" ".join(p) for p in decode_labels]
    scores = rouge.get_scores(decode_preds, decode_labels, avg=True)
    #scores = rouge.get_scores(decode_preds, decode_labels, avg=True)
    return {"rouge-l": scores['rouge-l']['f']}



### Step6: Set training parameters

In [27]:
args = Seq2SeqTrainingArguments(
    output_dir="./summary",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=8,
    warmup_steps=128,
    logging_steps=1024,
    logging_dir="./logging",
    evaluation_strategy="steps",
    save_strategy="steps",
    save_steps=512,
    save_total_limit=3,     # save the last 3 model
    metric_for_best_model="rouge-l",
    predict_with_generate=True,  # must set True
    #load_best_model_at_end=True
)

### Step7: Create trainer

In [28]:
trainer = Seq2SeqTrainer(
    args=args,
    model=model,
    train_dataset=tokenized_ds['train'],
    eval_dataset=tokenized_ds['test'],
    compute_metrics=compute_metric,
    tokenizer=tokenizer,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer)
)

### Step8: Train the model

In [29]:
trainer.train()

Step,Training Loss,Validation Loss,Rouge-l
256,1.9199,1.767691,0.768248


Step,Training Loss,Validation Loss,Rouge-l
256,1.9199,1.767691,0.768248
512,1.9228,1.744241,0.769261


Step,Training Loss,Validation Loss,Rouge-l
256,1.9199,1.767691,0.768248
512,1.9228,1.744241,0.769261
768,1.9145,1.736634,0.770397


Step,Training Loss,Validation Loss,Rouge-l
256,1.9199,1.767691,0.768248
512,1.9228,1.744241,0.769261
768,1.9145,1.736634,0.770397


KeyboardInterrupt: 

In [30]:
pipe = pipeline('text2text-generation', model=model, tokenizer=tokenizer, device=0)
text = ds['validation'][200]['article']
target = ds['validation'][200]['highlights']
print(target)
print("----------------------------------\n")
pip_res = pipe("Generate summary:\n" + text, max_length = 64)
t5_summary = pip_res[0]['generated_text']
print(t5_summary)
print("----------------------------------\n")
print(" Rouge-L between label and generate summary with t5 model is ", rouge.get_scores(target, t5_summary)[0]['rouge-l'])


Manchester United thrashes Tottenham Hotspur 3-0 . Wayne Rooney celebrates his goal by pretending to be knocked out . Chelsea held 1-1 at home but extends EPL lead at top to six points . Gareth Bale scores twice as Real Madrid keeps pace in La Liga .
----------------------------------
Wayne Rooney scores first half goals to give Tottenham Hotspur 3-0 win . Tottenham are seventh, six points adrift of the top four . Everton ease relegation fears with 3-0 win over Newcastle .
----------------------------------

 Rouge-L between label and generate summary with t5 model is  {'r': 0.32142857142857145, 'p': 0.21951219512195122, 'f': 0.2608695603948751}


### Step8.5: Retrain model if needed

In [20]:
model = AutoModelForSeq2SeqLM.from_pretrained("./summary/last-checkpoint-3072")
print("Model weights loaded...\n")

pipe = pipeline('text2text-generation', model=model, tokenizer=tokenizer, device=0)
text = ds['validation'][200]['article']
target = ds['validation'][200]['highlights']
print(target)
print("----------------------------------\n")
pip_res = pipe("Generate summary:\n" + text, max_length = 64)
t5_summary = pip_res[0]['generated_text']
print(t5_summary)
print("----------------------------------\n")
print(" Rouge-L between label and generate summary with t5 model is ", rouge.get_scores(target, t5_summary)[0]['rouge-l'])


Token indices sequence length is longer than the specified maximum sequence length for this model (935 > 512). Running this sequence through the model will result in indexing errors


Model weights loaded...

Manchester United thrashes Tottenham Hotspur 3-0 . Wayne Rooney celebrates his goal by pretending to be knocked out . Chelsea held 1-1 at home but extends EPL lead at top to six points . Gareth Bale scores twice as Real Madrid keeps pace in La Liga .
----------------------------------
Wayne Rooney scores first half goals to give Tottenham Hotspur 3-0 win . Marouane Fellaini, Michael Carrick and Rooney score . Everton ease relegation fears with 3-0 win over Newcastle .
----------------------------------

 Rouge-L between label and generate summary with t5 model is  {'r': 0.28, 'p': 0.17073170731707318, 'f': 0.2121212074150598}


In [91]:
validations = ds['validation']
texts: list[str] = validations['article']
labels: list[str] = validations['highlights']
t5_summaries : list[str] = [pipe(each, max_length = 64)[0]['generated_text'] for each in texts]
rouge.get_scores(labels, t5_summaries, avg=True)['rouge-l']

{'r': 0.41089488479762454, 'p': 0.3141787777530955, 'f': 0.346684883166558}

In [70]:
text_generator = pipeline("text2text-generation")

# List of input prompts
input_prompts = [
    "Once upon a time, there was a king who ruled over a prosperous kingdom.",
    "In a galaxy far, far away, a young Jedi embarked on a journey to defeat the Sith.",
    "The scientist conducted an experiment that would change the course of human history."
]

# Generate results for each input prompt
results = [text_generator(prompt) for prompt in input_prompts]

# Print results
for input_prompt, result in zip(input_prompts, results):
    print("Input Prompt:", input_prompt)
    print("Generated Text:", result)
    print()

No model was supplied, defaulted to google-t5/t5-base and revision 686f1db (https://huggingface.co/google-t5/t5-base).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on google-t5/t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


Input Prompt: Once upon a time, there was a king who ruled over a prosperous kingdom.
Generated Text: [{'generated_text': 'a king who ruled over a prosperous kingdom. there was a'}]

Input Prompt: In a galaxy far, far away, a young Jedi embarked on a journey to defeat the Sith.
Generated Text: [{'generated_text': 'a galaxy far, far away., far away. A Jedi. A Jedi'}]

Input Prompt: The scientist conducted an experiment that would change the course of human history.
Generated Text: [{'generated_text': 'a scientist who conducted an experiment that would change the course of human history.'}]


### Step9: Evaluate the model

In [None]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
from heapq import nlargest
from datasets import load_dataset
from rouge import Rouge
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments

stopwords = list(STOP_WORDS)
nlp = spacy.load('en_core_web_sm')


In [None]:
def select_main_sentence(text, punctuation, nlp):
    summary_length = 3
    doc = nlp(text)
    tokens = [token.text for token in doc]
    punctuation = punctuation + '\n'
    sentence_tokens = [sent for sent in doc.sents]
    
    word_frequencies = {}
    for word in doc:
        if word.text.lower() not in stopwords:
            if word.text.lower() not in punctuation:
                if word.text not in word_frequencies.keys():
                    word_frequencies[word.text] = 1
                else:
                    word_frequencies[word.text] += 1

    sentence_scores = {}
    for sent in sentence_tokens:
        for word in sent:
            if word.text.lower() in word_frequencies.keys():
                if sent not in sentence_scores.keys():
                    sentence_scores[sent] = word_frequencies[word.text.lower()]
                else:
                    sentence_scores[sent] += word_frequencies[word.text.lower()]
         
    summary = nlargest(summary_length, sentence_scores, key = sentence_scores.get)
    return summary

In [None]:
print("\n----------------------------article---------------------------------------\n")
text = ds['validation'][1400]['article']
print(text)
print("\n----------------------------label---------------------------------------\n")
target = ds['validation'][1400]['highlights']
print(target)
print("\n----------------------------generate summary---------------------------------------")
summary = select_main_sentence(text, punctuation, nlp)
generate_summary = ""
for each in summary:
    generate_summary = generate_summary + str(each)
print(generate_summary)
print(" Rouge-L: ", rouge.get_scores(target, generate_summary)[0]['rouge-l'])

print("\n----------------------------generate summary t5 model---------------------------------------")
pip_res = pipe("Generate summary:\n" + text, max_length = 64)
t5_summary = pip_res[0]['generated_text']
print(t5_summary)
print(" Rouge-L between label and generate summary with t5 model is ", rouge.get_scores(target, t5_summary)[0]['rouge-l'])