In [20]:
! pip install -U accelerate
! pip install -U transformers
! pip install torch
! pip install datasets
! pip install tqdm
! pip install scikit-learn
! pip install evaluate
! pip install rouge_score

import re
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import TrainingArguments
import evaluate
from datasets import Dataset, DatasetDict
from transformers import Trainer

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25ldone
[?25h  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=5f614ca9b1930e417521bd85657b32de518945bd562bfda79593739a43d09b31
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [21]:
df = pd.read_csv('/kaggle/input/news-summarization-dataset-for-deep-learning-1/merge_df.csv', index_col=0)
df

Unnamed: 0,File_path,Articles,Summaries
0,politics,Budget to set scene for election..Gordon Brown...,- Increase in the stamp duty threshold from £6...
1,politics,Army chiefs in regiments decision..Military ch...,"""They are very much not for the good and will ..."
2,politics,Howard denies split over ID cards..Michael How...,Michael Howard has denied his shadow cabinet w...
3,politics,Observers to monitor UK election..Ministers wi...,The report said individual registration should...
4,politics,Kilroy names election seat target..Ex-chat sho...,"UKIP's leader, Roger Knapman, has said he is g..."
...,...,...,...
5444,accidents,HONG KONG — Hundreds of pilot whales that s...,more than 500 rescuers tried frantically to se...
5445,sports,"NICE, France — Rivère accepts the complim...",Signing balotelli was not just a way to garner...
5446,business,FRANKFURT — Germans who never really warmed...,Although there was no evidence of that the bun...
5447,sports,Charles Oakley has strong feelings about compe...,He questioned why any n. b. a. free agent woul...


In [22]:
filtered_df = df[df['File_path'].isin(['sports', 'sport'])]

In [23]:
filtered_df

Unnamed: 0,File_path,Articles,Summaries
417,sport,Hodges announces rugby retirement..Scarlets an...,"The 36-year-old, who has 54 caps, was Llanelli..."
418,sport,Bomb threat at Bernabeu stadium..Spectators we...,Spectators were evacuated from Real Madrid's B...
419,sport,Parmar ruled out of Davis Cup tie..A knee inju...,The unheralded Sherwood was the surprise inclu...
420,sport,Benitez issues warning to Gerrard..Liverpool m...,"Benitez responded: ""I spoke to Steven and said..."
421,sport,Officials respond in court row..Australian ten...,"Hewitt said he had had a ""gutful"" of trying to..."
...,...,...,...
5404,sports,"Many of the world’s winter athletes, now prepa...",as olympic officials review the evidence deliv...
5411,sports,The sale of a major league sports team always ...,would fans or sponsors boycott or embrace the ...
5436,sports,"Welcome to Our Picks, a guide to the best stuf...",enjoy them for the first time when you have so...
5445,sports,"NICE, France — Rivère accepts the complim...",Signing balotelli was not just a way to garner...


In [24]:
filtered_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1051 entries, 417 to 5447
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   File_path  1051 non-null   object
 1   Articles   1051 non-null   object
 2   Summaries  1051 non-null   object
dtypes: object(3)
memory usage: 32.8+ KB


## Data Preprocessing

In [25]:
BATCH_SIZE = 4
NUM_PROCS = 4
EPOCHS = 8
OUT_DIR = 'results_t5base'
MAX_LENGTH = 256

#### Split Dataset

In [26]:
data_train, data_test = train_test_split(filtered_df, test_size=0.10, random_state=42)

In [27]:
!pip install datasets



In [28]:

train_ds = Dataset.from_pandas(data_train, preserve_index=False)
test_ds = Dataset.from_pandas(data_test, preserve_index=False)

In [29]:
articles = DatasetDict()

In [30]:
articles['train'] = train_ds
articles['test'] = test_ds

## Tokenizer

In [31]:
dataset_train = articles['train']
dataset_valid = articles['test']

In [32]:
# load tokenizer and model
model_name = 't5-base'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [33]:
def preprocess_function(examples):
    inputs = [f"summarize: {article}" for article in examples['Articles']]
    model_inputs = tokenizer(
        inputs,
        max_length=MAX_LENGTH,
        truncation=True,
        padding='max_length'
    )

    # Set up the tokenizer for targets
    targets = [summary for summary in examples['Summaries']]
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            targets,
            max_length=MAX_LENGTH,
            truncation=True,
            padding='max_length'
        )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [34]:
tokenized_train = dataset_train.map(
    preprocess_function,
    batched=True,
    num_proc=NUM_PROCS
)

     

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

#2:   0%|          | 0/1 [00:00<?, ?ba/s]



In [35]:
tokenized_valid = dataset_valid.map(
    preprocess_function,
    batched=True,
    num_proc=NUM_PROCS
)

        

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

#2:   0%|          | 0/1 [00:00<?, ?ba/s]



In [36]:
from transformers import T5ForConditionalGeneration
import torch

model = T5ForConditionalGeneration.from_pretrained('t5-base')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

In [37]:
import evaluate
rouge = evaluate.load("rouge")

In [38]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred.predictions[0], eval_pred.label_ids

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        use_stemmer=True,
        rouge_types=[
            'rouge1',
            'rouge2',
            'rougeL'
        ]
    )

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [39]:
def preprocess_logits_for_metrics(logits, labels):

    pred_ids = torch.argmax(logits[0], dim=-1)
    return pred_ids, labels

In [40]:
training_args = TrainingArguments(
    output_dir=OUT_DIR,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    warmup_steps=50,
    weight_decay=0.01,
    logging_dir=OUT_DIR,
    logging_steps=10,
    evaluation_strategy='steps',
    eval_steps=200,
    save_strategy='epoch',
    save_total_limit=2,
    report_to='tensorboard',
    learning_rate=0.0002,
    dataloader_num_workers=4
)

In [41]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_valid,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
    compute_metrics=compute_metrics
)

In [42]:
history = trainer.train()

Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Gen Len
200,1.0427,1.029591,0.7731,0.6353,0.7326,185.4528
400,0.7871,0.919268,0.7862,0.6539,0.751,185.4528
600,0.862,0.842314,0.7991,0.6683,0.7651,185.4528
800,0.551,0.771776,0.8134,0.6871,0.7824,185.4528
1000,0.3928,0.732438,0.825,0.7047,0.7972,185.4528
1200,0.4676,0.687911,0.8343,0.7183,0.8086,185.4528
1400,0.3666,0.65865,0.8404,0.7299,0.8173,185.4528
1600,0.4203,0.639894,0.8455,0.7394,0.8241,185.4528
1800,0.5121,0.628516,0.8492,0.7451,0.829,185.4528


In [43]:
trainer.evaluate()

{'eval_loss': 0.6282171010971069,
 'eval_rouge1': 0.8491,
 'eval_rouge2': 0.7449,
 'eval_rougeL': 0.8286,
 'eval_gen_len': 185.4528,
 'eval_runtime': 4.6893,
 'eval_samples_per_second': 22.605,
 'eval_steps_per_second': 5.758,
 'epoch': 8.0}

In [44]:
model.save_pretrained("/kaggle/working/t5-sports-summarizer")

In [45]:
trainer.save_model("/kaggle/working/t5-sports-summarizer")