In [1]:
!pip install transformers
!pip install rouge-score

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (pyproject.toml) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24987 sha256=6e9c7efed9bd0d3b995fa5aa3017f8b9acad6c668507d99bd402c77c1dc79b47
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [2]:
import pandas as pd
from transformers import T5Tokenizer, T5ForConditionalGeneration
from rouge_score import rouge_scorer


In [3]:
# Path of the BBC articles CSV that is loaded with `pd.read_csv` below.
# NOTE(review): the file name contains a space after "bbc_new" — confirm it
# matches the uploaded file exactly.
BBC_csv = '/content/bbc_new _text_complexity_summarization.csv'
BBC_csv

'/content/bbc_new _text_complexity_summarization.csv'

In [4]:
# Name of the DataFrame column whose value is passed to the summarizer.
text_column_name = 'text'

In [5]:
# Name of the DataFrame column used as the reference when computing ROUGE.
reference_summary_column_name = 'text_rank_summary'

In [6]:
# Checkpoint identifier handed to `from_pretrained` for the tokenizer and the model.
t5_model_name = 't5-small'

In [7]:
# Load the dataset and preview its first rows.
# NOTE(review): the preview contains an "Unnamed: 0" column, which usually means
# the CSV was written together with its index; consider `index_col=0` if that
# column is not needed — confirm before changing.
df = pd.read_csv(BBC_csv)
df.head()

Unnamed: 0,text,labels,no_sentences,Flesch Reading Ease Score,Dale-Chall Readability Score,text_rank_summary,lsa_summary
0,Ad sales boost Time Warner profit\n\nQuarterly...,business,26,62.17,9.72,It hopes to increase subscribers by offering t...,Its profits were buoyed by one-off gains which...
1,Dollar gains on Greenspan speech\n\nThe dollar...,business,17,65.56,9.09,The dollar has hit its highest level against t...,"""I think the chairman's taking a much more san..."
2,Yukos unit buyer faces loan claim\n\nThe owner...,business,14,69.21,9.66,The owners of embattled Russian oil giant Yuko...,Yukos' owner Menatep Group says it will ask Ro...
3,High fuel prices hit BA's profits\n\nBritish A...,business,24,62.98,9.86,Looking ahead to its full year results to Marc...,"Rod Eddington, BA's chief executive, said the ..."
4,Pernod takeover talk lifts Domecq\n\nShares in...,business,17,70.63,10.23,Reports in the Wall Street Journal and the Fin...,Shares in UK drinks and food firm Allied Domec...


In [8]:
# Tokenizer for the checkpoint named by `t5_model_name`; used to encode inputs
# and decode generated ids in `summarize_with_t5`.
t5_tokenizer = T5Tokenizer.from_pretrained(t5_model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [9]:
# Sequence-to-sequence model for the checkpoint named by `t5_model_name`;
# `summarize_with_t5` calls its `generate` method.
t5_model = T5ForConditionalGeneration.from_pretrained(t5_model_name)

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [10]:
# Shared scorer for ROUGE-1, ROUGE-2 and ROUGE-L with stemming enabled
# (`use_stemmer=True`); `calculate_rouge` reads it as a module-level name.
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
print("ROUGE scorer initialized.")

ROUGE scorer initialized.


In [11]:
def summarize_with_t5(text, max_length=150, min_length=30):
    """Generate an abstractive summary of ``text`` with the notebook's T5 model.

    Args:
        text: Article text. Non-string values are converted with ``str``.
        max_length: Upper bound on the generated length, forwarded to ``generate``.
        min_length: Lower bound on the generated length, forwarded to ``generate``.

    Returns:
        The decoded summary string, or ``""`` when ``text`` is missing
        (``None``/NaN) or contains only whitespace.
    """
    # A missing cell would otherwise be summarized as the literal string "nan".
    if pd.isna(text):
        return ""
    article = str(text)
    if not article.strip():
        return ""

    inputs = t5_tokenizer(
        "summarize: " + article,
        return_tensors="pt",
        max_length=512,  # inputs longer than 512 tokens are truncated
        truncation=True,
    )
    # Keep the tensors on the same device as the model so the function still
    # works after the model has been moved (e.g. to a GPU).
    inputs = inputs.to(t5_model.device)

    summary_ids = t5_model.generate(
        inputs["input_ids"],
        # Pass the mask explicitly instead of letting generate() infer it.
        attention_mask=inputs["attention_mask"],
        max_length=max_length,
        min_length=min_length,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )
    return t5_tokenizer.decode(summary_ids[0], skip_special_tokens=True)


In [12]:
def calculate_rouge(generated_summary, reference_summary):
    """Score a generated summary against a reference with the shared ``scorer``.

    Args:
        generated_summary: Model output to evaluate.
        reference_summary: Reference text to compare against.

    Returns:
        A dict keyed by ``'rouge1'``, ``'rouge2'`` and ``'rougeL'`` whose values
        expose ``precision``, ``recall`` and ``fmeasure`` attributes. When either
        input is missing (``None``/NaN), every value is zero.
    """
    if pd.isna(generated_summary) or pd.isna(reference_summary):
        # Return the same Score type that ``scorer.score`` produces so callers
        # can read ``.fmeasure`` on both paths; plain dicts raise AttributeError.
        from rouge_score import scoring

        zero = scoring.Score(precision=0.0, recall=0.0, fmeasure=0.0)
        return {metric: zero for metric in ('rouge1', 'rouge2', 'rougeL')}

    # RougeScorer.score takes the reference (target) first, then the prediction.
    return scorer.score(str(reference_summary), str(generated_summary))

In [13]:
# Number of rows, taken from the top of `df`, that the evaluation loop processes.
num_examples_to_process = 5
# One record per processed article; later converted into a DataFrame.
results = []
# F1 values per ROUGE variant; the loop appends one value per processed article.
t5_rouge_scores = {'rouge1': [], 'rouge2': [], 'rougeL': []}
print(f"Summarizing and evaluating the first {num_examples_to_process} articles using T5")

Summarizing and evaluating the first 5 articles using T5


In [14]:
# Start from empty accumulators so that re-running this cell does not append
# the same articles a second time and skew the averages computed later.
results.clear()
for metric_scores in t5_rouge_scores.values():
    metric_scores.clear()

for index, row in df.head(num_examples_to_process).iterrows():
    original_text = row[text_column_name]
    reference_summary = row[reference_summary_column_name]

    t5_summary = summarize_with_t5(original_text)
    t5_scores = calculate_rouge(t5_summary, reference_summary)

    # Extract each F1 once and reuse it for the running lists, the record and the printout.
    f1 = {metric: t5_scores[metric].fmeasure for metric in ('rouge1', 'rouge2', 'rougeL')}
    for metric, value in f1.items():
        t5_rouge_scores[metric].append(value)

    results.append({
        'Original Text': original_text,
        'Reference Summary': reference_summary,
        'T5 Summary (Abstractive)': t5_summary,
        'T5 ROUGE-1 F1': f1['rouge1'],
        'T5 ROUGE-2 F1': f1['rouge2'],
        'T5 ROUGE-L F1': f1['rougeL'],
    })

    # str() keeps the preview from failing on a missing text cell (NaN is a float,
    # which supports neither len() nor slicing).
    preview = str(original_text)
    print(f"--- Article {index + 1} ---")
    print("Original:")
    print(preview[:300] + "..." if len(preview) > 300 else preview)
    print("\nReference Summary:")
    print(reference_summary)
    print("\nT5 Summary (Abstractive):")
    print(t5_summary)
    print("\nROUGE Scores (F1-measure):")
    print(f"  T5: ROUGE-1={f1['rouge1']:.4f}, ROUGE-2={f1['rouge2']:.4f}, ROUGE-L={f1['rougeL']:.4f}")


--- Article 1 ---
Original:
Ad sales boost Time Warner profit

Quarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (£600m) for the three months to December, from $639m year-earlier.

The firm, which is now one of the biggest investors in Google, benefited from sales of high-speed internet connections and highe...

Reference Summary:
It hopes to increase subscribers by offering the online service free to TimeWarner internet customers and will try to sign up AOL's existing customers for high-speed broadband. But its film division saw profits slump 27% to $284m, helped by box-office flops Alexander and Catwoman, a sharp contrast to year-earlier, when the third and final film in the Lord of the Rings trilogy boosted results. It intends to adjust the way it accounts for a deal with German music publisher Bertelsmann's purchase of a stake in AOL Europe, which it had reported as advertising revenue. 

T5 Summary (Abstractive):
time Warner profits jumped 76% to $1.13bn (£600m)

In [15]:
# Show the per-article ROUGE-1 F1 values collected by the evaluation loop.
t5_rouge_scores['rouge1']

[0.15714285714285714,
 0.24358974358974358,
 0.4210526315789474,
 0.2797202797202797,
 0.19642857142857145]

In [16]:
# Mean ROUGE-1 F1 over the processed articles; NaN (rather than a
# ZeroDivisionError) when no article was scored.
rouge1_f1 = t5_rouge_scores['rouge1']
avg_t5_rouge1 = sum(rouge1_f1) / len(rouge1_f1) if rouge1_f1 else float('nan')

In [17]:
# Mean ROUGE-2 F1 over the processed articles; NaN (rather than a
# ZeroDivisionError) when no article was scored.
rouge2_f1 = t5_rouge_scores['rouge2']
avg_t5_rouge2 = sum(rouge2_f1) / len(rouge2_f1) if rouge2_f1 else float('nan')

In [18]:
# Mean ROUGE-L F1 over the processed articles; NaN (rather than a
# ZeroDivisionError) when no article was scored.
rougeL_f1 = t5_rouge_scores['rougeL']
avg_t5_rougeL = sum(rougeL_f1) / len(rougeL_f1) if rougeL_f1 else float('nan')
print(f"Average T5 ROUGE: ROUGE-1={avg_t5_rouge1:.4f}, ROUGE-2={avg_t5_rouge2:.4f}, ROUGE-L={avg_t5_rougeL:.4f}")

Average T5 ROUGE: ROUGE-1=0.2596, ROUGE-2=0.0715, ROUGE-L=0.1757


In [19]:
# Collect the per-article records into a table.
results_df = pd.DataFrame(results)
print("\n--- Detailed T5 Summarization and Evaluation Results ---")
# Render a rich table instead of `print(...to_string())`, which pads every row to
# the width of the longest summary and is unreadable in the output area.
# max_colwidth=None keeps the summaries from being truncated in the table.
with pd.option_context('display.max_colwidth', None):
    display(results_df[['Reference Summary', 'T5 Summary (Abstractive)',
                        'T5 ROUGE-1 F1', 'T5 ROUGE-2 F1', 'T5 ROUGE-L F1']])


--- Detailed T5 Summarization and Evaluation Results ---
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   Reference Summary                                                                                                                                                                                                                                                              T5 Summary (Abstractive)  T5 ROUGE-1 F1  T5 ROUGE-2 F1  T5 ROUGE-L F1
0                     