In [None]:
import re
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ivy\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  torch.utils._pytree._register_pytree_node(


In [None]:
# Load the merged news dataset; the first CSV column is used as the row index.
df = pd.read_csv('merge_df.csv', index_col=0)
# Bare expression so the notebook renders the frame as this cell's output.
df

Unnamed: 0,File_path,Articles,Summaries
0,politics,Budget to set scene for election..Gordon Brown...,- Increase in the stamp duty threshold from £6...
1,politics,Army chiefs in regiments decision..Military ch...,"""They are very much not for the good and will ..."
2,politics,Howard denies split over ID cards..Michael How...,Michael Howard has denied his shadow cabinet w...
3,politics,Observers to monitor UK election..Ministers wi...,The report said individual registration should...
4,politics,Kilroy names election seat target..Ex-chat sho...,"UKIP's leader, Roger Knapman, has said he is g..."
...,...,...,...
5444,accidents,HONG KONG — Hundreds of pilot whales that s...,more than 500 rescuers tried frantically to se...
5445,sports,"NICE, France — Rivère accepts the complim...",Signing balotelli was not just a way to garner...
5446,business,FRANKFURT — Germans who never really warmed...,Although there was no evidence of that the bun...
5447,sports,Charles Oakley has strong feelings about compe...,He questioned why any n. b. a. free agent woul...


In [None]:
# Print the column dtypes and non-null counts of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5449 entries, 0 to 5448
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   File_path  5449 non-null   object
 1   Articles   5449 non-null   object
 2   Summaries  5449 non-null   object
dtypes: object(3)
memory usage: 170.3+ KB


## Data Preprocessing

#### Split Dataset

In [None]:
# Hold out 20% of the (article, summary) pairs as the test set; the fixed
# random_state keeps the split reproducible across runs.
X, X_test, y, y_test = train_test_split(
    df['Articles'],
    df['Summaries'],
    test_size=0.2,
    random_state=42,
)

In [None]:
# Carve a validation set (10%) out of the remaining non-test data, again with
# a fixed random_state for reproducibility.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

In [None]:
# Pretrained checkpoint used for both the tokenizer and the seq2seq model.
model_name = 't5-base'

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Collator that batches tokenized examples for the trainer; it is given the
# model as well as the tokenizer.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
)

Downloading config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


Downloading model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

Downloading generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
# Tokenize the articles (model inputs, truncated to 512 tokens) and the
# reference summaries (targets, truncated to 64 tokens). Each call pads to the
# longest sequence it was given.
X_train_encodings = tokenizer(X_train.tolist(), max_length=512, truncation=True, padding=True)
X_val_encodings = tokenizer(X_val.tolist(), max_length=512, truncation=True, padding=True)
y_train_encodings = tokenizer(y_train.tolist(), max_length=64, truncation=True, padding=True)
y_val_encodings = tokenizer(y_val.tolist(), max_length=64, truncation=True, padding=True)

# The target ids are fed to the model as `labels`. Padding positions must be
# -100 (the index ignored by the loss); otherwise the model is also trained to
# predict pad tokens and the reported loss is diluted by them.
for label_encodings in (y_train_encodings, y_val_encodings):
    label_encodings["input_ids"] = [
        [token_id if token_id != tokenizer.pad_token_id else -100 for token_id in sequence]
        for sequence in label_encodings["input_ids"]
    ]

In [None]:
import torch
class newsDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenized inputs with tokenized targets.

    Both `encodings` and `labels` are mappings indexed by field name and then
    by example position. `encodings` must provide "input_ids" and
    "attention_mask"; the "input_ids" of `labels` are exposed as "labels".
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # One example per tokenized input sequence.
        return len(self.encodings["input_ids"])

    def __getitem__(self, idx):
        # Input-side fields are copied under their own names ...
        example = {
            field: torch.tensor(self.encodings[field][idx])
            for field in ("input_ids", "attention_mask")
        }
        # ... while the target token ids become the "labels" entry.
        example["labels"] = torch.tensor(self.labels["input_ids"][idx])
        return example

# Wrap the tokenized train / validation splits so the trainer can index them.
train_dataset, val_dataset = (
    newsDataset(X_train_encodings, y_train_encodings),
    newsDataset(X_val_encodings, y_val_encodings),
)

In [None]:
# Hyper-parameters for fine-tuning. Evaluation runs once per epoch and at most
# three checkpoints are kept under ./results.
training_args = Seq2SeqTrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=1,
    # Mixed precision needs a GPU; only enable it when CUDA is present so the
    # notebook still runs on CPU-only machines (inference below already falls
    # back to CPU in the same way).
    fp16=torch.cuda.is_available(),
)

In [None]:
# Assemble the trainer from the model, its training configuration, the two
# dataset splits and the seq2seq batch collator.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

In [None]:
# Run fine-tuning; the returned TrainOutput is rendered as the cell output.
trainer.train()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,0.8312,0.685208


TrainOutput(global_step=981, training_loss=0.7581181735195273, metrics={'train_runtime': 598.0086, 'train_samples_per_second': 6.56, 'train_steps_per_second': 1.64, 'total_flos': 2388941804666880.0, 'train_loss': 0.7581181735195273, 'epoch': 1.0})

In [None]:
# Persist the fine-tuned weights together with the tokenizer files so the
# directory can later be reloaded on its own with `from_pretrained`.
model.save_pretrained('t5-base-fine-tuned-news')
tokenizer.save_pretrained('t5-base-fine-tuned-news')

In [None]:
# Convert the test articles to a plain list. `list(...)` works on both the
# original Series and an already-converted list, so re-running this cell no
# longer raises AttributeError.
X_test = list(X_test)

AttributeError: 'list' object has no attribute 'tolist'

In [None]:
# Show the first test article as a sanity check of the list conversion.
X_test[0]

'McCririck out of Big Brother show\n\nRacing pundit John McCririck has become the latest contestant to be evicted from Celebrity Big Brother.\n\nHe was nominated to leave the Channel 4 show by fellow housemates, alongside Happy Mondays dancer Bez. At one time Bez was among the most popular contestants but he has since become withdrawn and argumentative. McCririck was ordered to leave the house on Monday, following Jackie Stallone, the actor Sylvester\'s mother, who was first to be evicted. Bez reacted badly to the news that he had been nominated by five of his fellow housemates, whilst John received four votes against him. Sylvester Stallone\'s ex-wife Brigitte Nielsen nominated both John and Bez. She said: "Bez is a difficult human being. There\'s something wrong with him. Even though he\'s making an effort, he\'s not very happy in here." Former Holby City actor Jeremy Edwards said he had nominated Bez after he became agitated on Friday night and talked about escaping over the wall to

In [None]:
# Display the reference summaries of the test split (still a pandas Series here).
y_test

3732    According to bookmaker Ladbrokes, John McCriri...
371     But Ms Short said the effect of the parallel c...
453     "I'm happy Madrid is interested in me because ...
290     Lib Dem Sir Archy Kirkwood, who chairs the Com...
4454    Despite Beijing recent actions which she said ...
                              ...                        
3614    It was named best film while Alexander Payne w...
1186    "One of the problems with video phones is peop...
4553    U.S. Says Russians Were Behind Cyberattacks on...
757     Liverpool manager Rafael Benitez said their qu...
2534    The five unions meeting Mr Prescott want the g...
Name: Summaries, Length: 1090, dtype: object

In [None]:
# Reference summaries of the test split as a plain list; used below as the
# 'summary' column of test_df.
b = y_test.tolist()

In [None]:
# Show the first reference summary (pairs with the first test article).
b[0]

'According to bookmaker Ladbrokes, John McCririck was 1/3 favourite to be evicted on Monday while Bez was at 9/4 .He said Bez was being "loopy" and "stressed".He was nominated to leave the Channel 4 show by fellow housemates, alongside Happy Mondays dancer Bez.McCririck was ordered to leave the house on Monday, following Jackie Stallone, the actor Sylvester\'s mother, who was first to be evicted.Bez reacted badly to the news that he had been nominated by five of his fellow housemates, whilst John received four votes against him.At one time Bez was among the most popular contestants but he has since become withdrawn and argumentative.'

In [None]:
# Pair every test article with its reference summary in a single frame.
data = dict(article=X_test, summary=b)
test_df = pd.DataFrame(data)
test_df.head()

Unnamed: 0,article,summary
0,McCririck out of Big Brother show\n\nRacing pu...,"According to bookmaker Ladbrokes, John McCriri..."
1,Short attacks US over tsunami aid..Former Cabi...,But Ms Short said the effect of the parallel c...
2,Reyes tricked into Real admission..Jose Antoni...,"""I'm happy Madrid is interested in me because ..."
3,CSA chief who 'quit' still in job..The head of...,"Lib Dem Sir Archy Kirkwood, who chairs the Com..."
4,BEIJING — President Tsai of Taiwan sharpl...,Despite Beijing recent actions which she said ...


In [None]:
# Prefer the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

device(type='cuda')

In [None]:
# Generate a summary for every test article with beam search and store the
# results in a new `generated_summary` column of test_df.
model.to(device)
# Make inference mode explicit so dropout is disabled regardless of the state
# the trainer left the model in.
model.eval()

generated_summaries = []
with torch.no_grad():
    for input_text in test_df['article']:
        # Articles longer than 512 tokens are truncated, matching training.
        input_ids = tokenizer.encode(
            input_text, return_tensors="pt", max_length=512, truncation=True
        ).to(model.device)
        summary_ids = model.generate(
            input_ids, max_length=150, length_penalty=2.0, num_beams=4, early_stopping=True
        )
        generated_summaries.append(
            tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        )

# Assign the whole column at once instead of writing it cell by cell.
test_df['generated_summary'] = generated_summaries

In [None]:
# Preview the test frame, now including the generated_summary column.
test_df.head()

Unnamed: 0,article,summary,generated_summary
0,McCririck out of Big Brother show\n\nRacing pu...,"According to bookmaker Ladbrokes, John McCriri...",McCririck was ordered to leave the house on Mo...
1,Short attacks US over tsunami aid..Former Cabi...,But Ms Short said the effect of the parallel c...,Former Cabinet minister Clare Short has critic...
2,Reyes tricked into Real admission..Jose Antoni...,"""I'm happy Madrid is interested in me because ...","""If I'm not (playing for Real) I'm going to ha..."
3,CSA chief who 'quit' still in job..The head of...,"Lib Dem Sir Archy Kirkwood, who chairs the Com...","The head of the ""failing"" Child Support Agency..."
4,BEIJING — President Tsai of Taiwan sharpl...,Despite Beijing recent actions which she said ...,President Tsai of Taiwan sharply criticized Ch...


In [None]:
from rouge import Rouge

def evaluate_rouge(hypotheses, references):
    """Score generated summaries against reference summaries with ROUGE.

    Args:
        hypotheses: Generated summaries, passed as the first argument to
            `Rouge.get_scores`.
        references: Reference summaries, passed as the second argument.

    Returns:
        The result of `Rouge().get_scores(..., avg=True)`; the caller reads it
        as a mapping such as `scores["rouge-1"]["f"]`.
    """
    return Rouge().get_scores(hypotheses, references, avg=True)

In [None]:
# Score the generated summaries against the references and report F1 values.
model_generated_summaries = test_df['generated_summary'].tolist()
reference_summaries = test_df['summary'].tolist()

rouge_scores = evaluate_rouge(model_generated_summaries, reference_summaries)

print("ROUGE Scores:")
# The value printed here is the unigram metric, so label it ROUGE-1 rather
# than the generic "ROUGE-N".
print("ROUGE-1 F1 Score:", rouge_scores["rouge-1"]["f"])
print("ROUGE-L F1 Score:", rouge_scores["rouge-l"]["f"])

ROUGE Scores:
ROUGE-N F1 Score: 0.4519763007101942
ROUGE-L F1 Score: 0.4450328824264304


In [None]:
# Reference summary of the second test example (column selected by name
# instead of by position).
test_df['summary'].iloc[1]

'But Ms Short said the effect of the parallel coalition would be to undermine the UN.She said only the UN had the "moral authority" to lead the relief work.The US was "very bad at coordinating with anyone" and India had its own problems, Ms Short said.Ms Short said the countries involved could not boast good records on their response to major disasters.Former Cabinet minister Clare Short has criticised the US-led tsunami aid coalition, saying the UN should be leading efforts.'

In [None]:
# Generated summary of the second test example, for side-by-side comparison
# with its reference summary.
test_df['generated_summary'].iloc[1]

'Former Cabinet minister Clare Short has criticised the US-led tsunami aid coalition, saying the UN should be leading efforts.President Bush has announced that an alliance of the US, India, Australia and Japan will co-ordinate a humanitarian drive.I think this initiative from America to set up four countries claiming to co-ordinate sounds like yet another attempt to undermine the UN when it is the best system we have got'