In [1]:
!pip install transformers nltk rouge sentencepiece



In [2]:
from transformers import T5ForConditionalGeneration, T5Tokenizer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge import Rouge
import sentencepiece

In [3]:
def summarize_with_t5(text, model, tokenizer, max_input_length=512, max_output_length=150):
    """Generate a summary of ``text`` with a T5-style model and tokenizer.

    Args:
        text: The string to summarize. It is prefixed with ``"summarize: "``
            before being tokenized.
        model: Object exposing ``generate(input_ids, ...)``, e.g. a
            ``T5ForConditionalGeneration`` instance. If it has a ``device``
            attribute, the encoded input is moved to that device first.
        tokenizer: Object exposing ``encode(...)`` and ``decode(...)``,
            e.g. a ``T5Tokenizer`` instance.
        max_input_length: Passed as ``max_length`` to ``tokenizer.encode``
            together with ``truncation=True``.
        max_output_length: Passed as ``max_length`` to ``model.generate``.

    Returns:
        The first generated sequence, decoded with special tokens skipped.
    """
    # Tokenize the prefixed text, truncating it to the input budget.
    input_ids = tokenizer.encode(
        "summarize: " + text,
        return_tensors="pt",
        max_length=max_input_length,
        truncation=True,
    )

    # The tokenizer returns its tensor independently of where the model lives;
    # move it next to the model so generation also works when the model has
    # been placed on an accelerator.
    device = getattr(model, "device", None)
    if device is not None:
        input_ids = input_ids.to(device)

    # Beam search with the same decoding settings on every call.
    summary_ids = model.generate(
        input_ids,
        max_length=max_output_length,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )

    # Only the first returned sequence is decoded.
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)


In [5]:
def calculate_rouge_scores(reference, hypothesis):
    """Score ``hypothesis`` against ``reference`` with the ``rouge`` package.

    A fresh ``Rouge`` scorer is created on every call. The result of
    ``Rouge.get_scores(hypothesis, reference)`` is returned unchanged.
    """
    # Note the argument order: get_scores takes the hypothesis first.
    return Rouge().get_scores(hypothesis, reference)

In [4]:
# Checkpoint identifier shared by the model and the tokenizer so the two stay matched.
model_name = "t5-base"
# Load the pretrained seq2seq model for this checkpoint.
model = T5ForConditionalGeneration.from_pretrained(model_name)
# Load the tokenizer that belongs to the same checkpoint.
tokenizer = T5Tokenizer.from_pretrained(model_name)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [6]:
# Sample passage used as the summarization input. The triple-quoted literal
# begins and ends with a newline, and that whitespace is part of the string.
paragraph_text  = """
Natural language processing (NLP) is a field of artificial intelligence that focuses on the interaction between computers and humans using natural language. It enables computers to understand, interpret, and generate human-like text. NLP involves several challenges, including natural language understanding, language generation, and language translation. Text summarization is a specific NLP task that involves reducing the length of a document while retaining its key information. There are two main approaches to text summarization: extractive and abstractive. Extractive methods select important sentences from the original text, while abstractive methods generate new sentences to form the summary.
"""

In [7]:
# Summarize the sample passage using the default input/output length limits.
summary = summarize_with_t5(
    text=paragraph_text,
    model=model,
    tokenizer=tokenizer,
)


In [8]:
# Show the source passage followed by the generated summary.
print(f"Original Text:\n {paragraph_text}")
print(f"\nSummarized Text:\n {summary}")

Original Text:
 
Natural language processing (NLP) is a field of artificial intelligence that focuses on the interaction between computers and humans using natural language. It enables computers to understand, interpret, and generate human-like text. NLP involves several challenges, including natural language understanding, language generation, and language translation. Text summarization is a specific NLP task that involves reducing the length of a document while retaining its key information. There are two main approaches to text summarization: extractive and abstractive. Extractive methods select important sentences from the original text, while abstractive methods generate new sentences to form the summary.


Summarized Text:
 natural language processing (NLP) is a field of artificial intelligence. it enables computers to understand, interpret, and generate human-like text. extractive methods select important sentences from the original text. abstractive methods generate new senten

In [9]:
# The full source passage is used as the reference here (not a separately
# written reference summary), and the generated summary is the hypothesis.
rouge_scores = calculate_rouge_scores(
    reference=paragraph_text,
    hypothesis=summary,
)

In [10]:
# Print the raw score structure returned by calculate_rouge_scores.
print(f"\nROUGE Scores:\n {rouge_scores}")


ROUGE Scores:
 [{'rouge-1': {'r': 0.4492753623188406, 'p': 0.96875, 'f': 0.6138613818096266}, 'rouge-2': {'r': 0.3333333333333333, 'p': 0.8378378378378378, 'f': 0.4769230728508876}, 'rouge-l': {'r': 0.4492753623188406, 'p': 0.96875, 'f': 0.6138613818096266}}]
