In [1]:
import re
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ivy\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Ivy\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Ivy\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
  torch.utils._pytree._register_pytree_node(


In [2]:
df = pd.read_csv('merge_df.csv', index_col=0)
df

Unnamed: 0,File_path,Articles,Summaries
0,politics,Budget to set scene for election..Gordon Brown...,- Increase in the stamp duty threshold from £6...
1,politics,Army chiefs in regiments decision..Military ch...,"""They are very much not for the good and will ..."
2,politics,Howard denies split over ID cards..Michael How...,Michael Howard has denied his shadow cabinet w...
3,politics,Observers to monitor UK election..Ministers wi...,The report said individual registration should...
4,politics,Kilroy names election seat target..Ex-chat sho...,"UKIP's leader, Roger Knapman, has said he is g..."
...,...,...,...
5444,accidents,HONG KONG — Hundreds of pilot whales that s...,more than 500 rescuers tried frantically to se...
5445,sports,"NICE, France — Rivère accepts the complim...",Signing balotelli was not just a way to garner...
5446,business,FRANKFURT — Germans who never really warmed...,Although there was no evidence of that the bun...
5447,sports,Charles Oakley has strong feelings about compe...,He questioned why any n. b. a. free agent woul...


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5449 entries, 0 to 5448
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   File_path  5449 non-null   object
 1   Articles   5449 non-null   object
 2   Summaries  5449 non-null   object
dtypes: object(3)
memory usage: 170.3+ KB


#### Data Preprocessing

In [3]:
# Add Prefix 'summarization' to article
prefix = 'summarization: '
df['clean_txt'] = df['Articles'].apply(lambda x: prefix + str(x))

In [4]:
df.head()

Unnamed: 0,File_path,Articles,Summaries,clean_txt
0,politics,Budget to set scene for election..Gordon Brown...,- Increase in the stamp duty threshold from £6...,summarization: Budget to set scene for electio...
1,politics,Army chiefs in regiments decision..Military ch...,"""They are very much not for the good and will ...",summarization: Army chiefs in regiments decisi...
2,politics,Howard denies split over ID cards..Michael How...,Michael Howard has denied his shadow cabinet w...,summarization: Howard denies split over ID car...
3,politics,Observers to monitor UK election..Ministers wi...,The report said individual registration should...,summarization: Observers to monitor UK electio...
4,politics,Kilroy names election seat target..Ex-chat sho...,"UKIP's leader, Roger Knapman, has said he is g...",summarization: Kilroy names election seat targ...


In [5]:
df.iloc[0,-1]

'summarization: Budget to set scene for election..Gordon Brown will seek to put the economy at the centre of Labour\'s bid for a third term in power when he delivers his ninth Budget at 1230 GMT. He is expected to stress the importance of continued economic stability, with low unemployment and interest rates. The chancellor is expected to freeze petrol duty and raise the stamp duty threshold from £60,000. But the Conservatives and Lib Dems insist voters face higher taxes and more means-testing under Labour...Treasury officials have said there will not be a pre-election giveaway, but Mr Brown is thought to have about £2bn to spare...- Increase in the stamp duty threshold from £60,000. - A freeze on petrol duty. - An extension of tax credit scheme for poorer families. - Possible help for pensioners The stamp duty threshold rise is intended to help first time buyers - a likely theme of all three of the main parties\' general election manifestos. Ten years ago, buyers had a much greater ch

#### Split Dataset

In [6]:
X, X_test, y, y_test = train_test_split(df['clean_txt'], df['Summaries'], test_size=0.2, random_state=42)

In [7]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=42)

In [8]:
# load tokenizer and model
model_name = 't5-base'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


#### Use GPU if available

In [9]:
import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [10]:
device

device(type='cuda')

In [11]:
model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

#### Tokenize data and create PyTorch dataset

In [12]:
X_train_encodings = tokenizer(X_train.tolist(), max_length=512, truncation=True, padding=True)
X_val_encodings = tokenizer(X_val.tolist(), max_length=512, truncation=True, padding=True)
y_train_encodings = tokenizer(y_train.tolist(), max_length=150, truncation=True, padding=True)
y_val_encodings = tokenizer(y_val.tolist(), max_length=150, truncation=True, padding=True)

In [13]:
import torch
import time
from sklearn.metrics import f1_score
from rouge_score import rouge_scorer

class newsDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {
            "input_ids": torch.tensor(self.encodings["input_ids"][idx]),
            "attention_mask": torch.tensor(self.encodings["attention_mask"][idx]),
            "labels": torch.tensor(self.labels["input_ids"][idx]),
        }
        return item

    def __len__(self):
        return len(self.encodings["input_ids"])
    
train_dataset = newsDataset(X_train_encodings, y_train_encodings)
val_dataset = newsDataset(X_val_encodings, y_val_encodings)

#### Define Hyperparameters and Train model

In [14]:
training_args = Seq2SeqTrainingArguments(
    output_dir='./results',
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    fp16=True,
)

In [15]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
)

In [16]:
trainer.train()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,1.2117,0.705703
2,0.6729,0.678294
3,0.6567,0.671496


TrainOutput(global_step=2943, training_loss=0.7709842620963674, metrics={'train_runtime': 2241.4642, 'train_samples_per_second': 5.251, 'train_steps_per_second': 1.313, 'total_flos': 7166825414000640.0, 'train_loss': 0.7709842620963674, 'epoch': 3.0})

#### Save weights

In [17]:
torch.save(model.state_dict(), 't5-base_fine_tuned_news_weights.pth')

In [18]:
from transformers import T5ForConditionalGeneration
import torch
# load the model with saved weights
model_name = 't5-base'
tokenizer2 = AutoTokenizer.from_pretrained(model_name)
model2 = T5ForConditionalGeneration.from_pretrained(model_name)
model2.load_state_dict(torch.load('t5-base_fine_tuned_news_weights.pth'))

<All keys matched successfully>

In [20]:
X_test.tolist()

['summarization: McCririck out of Big Brother show\n\nRacing pundit John McCririck has become the latest contestant to be evicted from Celebrity Big Brother.\n\nHe was nominated to leave the Channel 4 show by fellow housemates, alongside Happy Mondays dancer Bez. At one time Bez was among the most popular contestants but he has since become withdrawn and argumentative. McCririck was ordered to leave the house on Monday, following Jackie Stallone, the actor Sylvester\'s mother, who was first to be evicted. Bez reacted badly to the news that he had been nominated by five of his fellow housemates, whilst John received four votes against him. Sylvester Stallone\'s ex-wife Brigitte Nielsen nominated both John and Bez. She said: "Bez is a difficult human being. There\'s something wrong with him. Even though he\'s making an effort, he\'s not very happy in here." Former Holby City actor Jeremy Edwards said he had nominated Bez after he became agitated on Friday night and talked about escaping 

In [21]:
X_test_list = X_test.tolist()

In [22]:
X_test_list[0]

'summarization: McCririck out of Big Brother show\n\nRacing pundit John McCririck has become the latest contestant to be evicted from Celebrity Big Brother.\n\nHe was nominated to leave the Channel 4 show by fellow housemates, alongside Happy Mondays dancer Bez. At one time Bez was among the most popular contestants but he has since become withdrawn and argumentative. McCririck was ordered to leave the house on Monday, following Jackie Stallone, the actor Sylvester\'s mother, who was first to be evicted. Bez reacted badly to the news that he had been nominated by five of his fellow housemates, whilst John received four votes against him. Sylvester Stallone\'s ex-wife Brigitte Nielsen nominated both John and Bez. She said: "Bez is a difficult human being. There\'s something wrong with him. Even though he\'s making an effort, he\'s not very happy in here." Former Holby City actor Jeremy Edwards said he had nominated Bez after he became agitated on Friday night and talked about escaping o

In [23]:
y_test

3732    According to bookmaker Ladbrokes, John McCriri...
371     But Ms Short said the effect of the parallel c...
453     "I'm happy Madrid is interested in me because ...
290     Lib Dem Sir Archy Kirkwood, who chairs the Com...
4454    Despite Beijing recent actions which she said ...
                              ...                        
3614    It was named best film while Alexander Payne w...
1186    "One of the problems with video phones is peop...
4553    U.S. Says Russians Were Behind Cyberattacks on...
757     Liverpool manager Rafael Benitez said their qu...
2534    The five unions meeting Mr Prescott want the g...
Name: Summaries, Length: 1090, dtype: object

In [24]:
y_test_list = y_test.tolist()

In [25]:
y_test_list[0]

'According to bookmaker Ladbrokes, John McCririck was 1/3 favourite to be evicted on Monday while Bez was at 9/4 .He said Bez was being "loopy" and "stressed".He was nominated to leave the Channel 4 show by fellow housemates, alongside Happy Mondays dancer Bez.McCririck was ordered to leave the house on Monday, following Jackie Stallone, the actor Sylvester\'s mother, who was first to be evicted.Bez reacted badly to the news that he had been nominated by five of his fellow housemates, whilst John received four votes against him.At one time Bez was among the most popular contestants but he has since become withdrawn and argumentative.'

In [44]:
data= {
    'article':X_test_list, 'summary':y_test_list
}
test_df = pd.DataFrame(data=data)
test_df.head()

Unnamed: 0,article,summary
0,summarization: McCririck out of Big Brother sh...,"According to bookmaker Ladbrokes, John McCriri..."
1,summarization: Short attacks US over tsunami a...,But Ms Short said the effect of the parallel c...
2,summarization: Reyes tricked into Real admissi...,"""I'm happy Madrid is interested in me because ..."
3,summarization: CSA chief who 'quit' still in j...,"Lib Dem Sir Archy Kirkwood, who chairs the Com..."
4,summarization: BEIJING — President Tsai o...,Despite Beijing recent actions which she said ...


In [46]:
# only the first 10 samples from test dataset are being tested due to time and GPU constraints
test_sample = test_df.iloc[0:10, :]
test_sample

Unnamed: 0,article,summary
0,summarization: McCririck out of Big Brother sh...,"According to bookmaker Ladbrokes, John McCriri..."
1,summarization: Short attacks US over tsunami a...,But Ms Short said the effect of the parallel c...
2,summarization: Reyes tricked into Real admissi...,"""I'm happy Madrid is interested in me because ..."
3,summarization: CSA chief who 'quit' still in j...,"Lib Dem Sir Archy Kirkwood, who chairs the Com..."
4,summarization: BEIJING — President Tsai o...,Despite Beijing recent actions which she said ...
5,summarization: MPs tout Lords replacement plan...,Their plan would see the House of Lords being ...
6,summarization: Reaction from Spanish press..En...,"Aragones' ""no comment"" to questions about raci..."
7,summarization: 'No re-draft' for EU patent law...,A proposed European law on software patents wi...
8,summarization: The sports landscape is littere...,The sports landscape is littered with failed p...
9,summarization: Bryan twins keep US hopes alive...,When Robredo dropped serve in the opening game...


In [27]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [47]:
model2.to(device)
for index, row in test_sample.iterrows():
    input_text = row['article']
    input_ids = tokenizer2.encode(input_text, return_tensors="pt", max_length=512, truncation=True).to(model2.device)
    summary_ids = model2.generate(input_ids, max_length=150, length_penalty=2.0, num_beams=4, early_stopping=True)
    generated_summary = tokenizer2.decode(summary_ids[0], skip_special_tokens=True)

    # Store or print the generated summary
    test_sample.at[index, 'generated_summary'] = generated_summary

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_new.at[index, 'generated_summary'] = generated_summary


In [48]:
test_sample.head()

Unnamed: 0,article,summary,generated_summary
0,summarization: McCririck out of Big Brother sh...,"According to bookmaker Ladbrokes, John McCriri...",Racing pundit John McCririck has become the la...
1,summarization: Short attacks US over tsunami a...,But Ms Short said the effect of the parallel c...,Former Cabinet minister Clare Short has critic...
2,summarization: Reyes tricked into Real admissi...,"""I'm happy Madrid is interested in me because ...","Before the story surfaced, Reyes had moved to ..."
3,summarization: CSA chief who 'quit' still in j...,"Lib Dem Sir Archy Kirkwood, who chairs the Com...","The head of the ""failing"" Child Support Agency..."
4,summarization: BEIJING — President Tsai o...,Despite Beijing recent actions which she said ...,President Tsai of Taiwan sharply criticized Ch...


In [51]:
test_new.to_csv('test_generated_summaries.csv', index=False)
test_results = pd.read_csv('test_generated_summaries.csv', index_col=None)
test_results.head()

Unnamed: 0,article,summary,generated_summary
0,summarization: McCririck out of Big Brother sh...,"According to bookmaker Ladbrokes, John McCriri...",Racing pundit John McCririck has become the la...
1,summarization: Short attacks US over tsunami a...,But Ms Short said the effect of the parallel c...,Former Cabinet minister Clare Short has critic...
2,summarization: Reyes tricked into Real admissi...,"""I'm happy Madrid is interested in me because ...","Before the story surfaced, Reyes had moved to ..."
3,summarization: CSA chief who 'quit' still in j...,"Lib Dem Sir Archy Kirkwood, who chairs the Com...","The head of the ""failing"" Child Support Agency..."
4,summarization: BEIJING — President Tsai o...,Despite Beijing recent actions which she said ...,President Tsai of Taiwan sharply criticized Ch...


#### Evaluation Metrics - Rouge

In [52]:
from rouge import Rouge

def evaluate_rouge(hypotheses, references):
    rouge = Rouge()
    scores = rouge.get_scores(hypotheses, references, avg=True)
    return scores

In [54]:
model_generated_summaries = test_results['generated_summary'].tolist()
reference_summaries = test_results['summary'].tolist()

rouge_scores = evaluate_rouge(model_generated_summaries, reference_summaries)

print("ROUGE Scores:")
print("ROUGE-N F1 Score:", rouge_scores["rouge-1"]["f"])
print("ROUGE-L F1 Score:", rouge_scores["rouge-l"]["f"])

ROUGE Scores:
ROUGE-N F1 Score: 0.5605940750735449
ROUGE-L F1 Score: 0.5543454447084836


In [57]:
# hand written summary
test_results.iloc[1,1]

'But Ms Short said the effect of the parallel coalition would be to undermine the UN.She said only the UN had the "moral authority" to lead the relief work.The US was "very bad at coordinating with anyone" and India had its own problems, Ms Short said.Ms Short said the countries involved could not boast good records on their response to major disasters.Former Cabinet minister Clare Short has criticised the US-led tsunami aid coalition, saying the UN should be leading efforts.'

In [58]:
# generated summary
test_results.iloc[1,2]

'Former Cabinet minister Clare Short has criticised the US-led tsunami aid coalition, saying the UN should be leading efforts."I think this initiative from America to set up four countries claiming to co-ordinate sounds like yet another attempt to undermine the UN when it is the best system we have got and the one that needs building up," she said."Only really the UN can do that job," she told BBC Radio Four\'s PM programme.Ms Short said the countries involved could not boast good records on their response to major disasters.But Ms Short said the effect of the parallel coalition would be to undermine the UN.'