### News Articles Summarization with T5-base model

### Importing Libraries

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install datasets



In [3]:
!pip install evaluate



In [4]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset
from sklearn import preprocessing
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from transformers import (
    T5Tokenizer,
    T5ForConditionalGeneration,
    TrainingArguments,
    Trainer
)
import numpy as np

import evaluate

### Loading Dataset

In [5]:
file_path = '/content/drive/MyDrive/Colab Notebooks/merge_df.csv'

data = pd.read_csv(file_path)

In [7]:
data.head()

Unnamed: 0.1,Unnamed: 0,File_path,Articles,Summaries
0,0,politics,Budget to set scene for election..Gordon Brown...,- Increase in the stamp duty threshold from £6...
1,1,politics,Army chiefs in regiments decision..Military ch...,"""They are very much not for the good and will ..."
2,2,politics,Howard denies split over ID cards..Michael How...,Michael Howard has denied his shadow cabinet w...
3,3,politics,Observers to monitor UK election..Ministers wi...,The report said individual registration should...
4,4,politics,Kilroy names election seat target..Ex-chat sho...,"UKIP's leader, Roger Knapman, has said he is g..."


In [8]:
data.rename(columns={'File_path':'Category'}, inplace=True)

In [9]:
data.head()

Unnamed: 0.1,Unnamed: 0,Category,Articles,Summaries
0,0,politics,Budget to set scene for election..Gordon Brown...,- Increase in the stamp duty threshold from £6...
1,1,politics,Army chiefs in regiments decision..Military ch...,"""They are very much not for the good and will ..."
2,2,politics,Howard denies split over ID cards..Michael How...,Michael Howard has denied his shadow cabinet w...
3,3,politics,Observers to monitor UK election..Ministers wi...,The report said individual registration should...
4,4,politics,Kilroy names election seat target..Ex-chat sho...,"UKIP's leader, Roger Knapman, has said he is g..."


In [10]:
data.shape

(5449, 4)

In [11]:
data['Category'].value_counts()

business         1228
politics         1158
sport            1021
entertainment     925
tech              802
crime             110
lifestyle          78
law                41
sports             30
science            25
technology         18
architecture        4
accidents           4
art                 2
health              2
environment         1
Name: Category, dtype: int64

In [12]:
data = data[data['Category'] == 'business'].reset_index(drop=True)

In [13]:
data['Category'].value_counts()

business    1228
Name: Category, dtype: int64

In [14]:
data.head()

Unnamed: 0.1,Unnamed: 0,Category,Articles,Summaries
0,1714,business,US consumer confidence up..Consumers' confiden...,"Wal-Mart, the largest US retailer, has said it..."
1,1715,business,The 'ticking budget' facing the US..The budget...,Brute force budget cuts or spending caps would...
2,1716,business,Mitsubishi in Peugeot link talks..Trouble-hit ...,Trouble-hit Mitsubishi Motors is in talks with...
3,1717,business,BMW reveals new models pipeline..BMW is prepar...,Typically it takes about three years from when...
4,1718,business,World leaders gather to face uncertainty..More...,"More than 2,000 business and political leaders..."


In [15]:
data.drop(columns=['Unnamed: 0','Category'], inplace=True)

In [16]:
data.head()

Unnamed: 0,Articles,Summaries
0,US consumer confidence up..Consumers' confiden...,"Wal-Mart, the largest US retailer, has said it..."
1,The 'ticking budget' facing the US..The budget...,Brute force budget cuts or spending caps would...
2,Mitsubishi in Peugeot link talks..Trouble-hit ...,Trouble-hit Mitsubishi Motors is in talks with...
3,BMW reveals new models pipeline..BMW is prepar...,Typically it takes about three years from when...
4,World leaders gather to face uncertainty..More...,"More than 2,000 business and political leaders..."


In [17]:
data_train, data_test = train_test_split(data, test_size=0.20, random_state=42)

In [18]:
train_ds = Dataset.from_pandas(data_train, preserve_index=False)
test_ds = Dataset.from_pandas(data_test, preserve_index=False)

In [19]:
articles = DatasetDict()

In [20]:
articles['train'] = train_ds
articles['test'] = test_ds

In [21]:
articles

DatasetDict({
    train: Dataset({
        features: ['Articles', 'Summaries'],
        num_rows: 982
    })
    test: Dataset({
        features: ['Articles', 'Summaries'],
        num_rows: 246
    })
})

### Data Pre-processing

In [22]:
BATCH_SIZE = 4
NUM_PROCS = 4
EPOCHS = 10
OUT_DIR = 'results_t5base'
MAX_LENGTH = 512

In [23]:
dataset_train = articles['train']
dataset_valid = articles['test']

In [24]:
from transformers import T5Tokenizer

tokenizer = T5Tokenizer.from_pretrained('t5-base')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior w

In [25]:
def preprocess_function(examples, tokenizer):
    inputs = [f"summarize: {article}" for article in examples['Articles']]
    model_inputs = tokenizer(
        inputs,
        max_length=512,
        truncation=True,
        padding='max_length'
    )

    # Set up the tokenizer for targets
    targets = [summary for summary in examples['Summaries']]
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            targets,
            max_length=512,
            truncation=True,
            padding='max_length'
        )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


In [26]:
tokenized_train = dataset_train.map(
    preprocess_function,
    batched=True,
    num_proc=NUM_PROCS,
    fn_kwargs={'tokenizer': tokenizer}
)

Map (num_proc=4):   0%|          | 0/982 [00:00<?, ? examples/s]



In [27]:
tokenized_valid = dataset_valid.map(
    preprocess_function,
    batched=True,
    num_proc=NUM_PROCS,
    fn_kwargs={'tokenizer': tokenizer}
)

Map (num_proc=4):   0%|          | 0/246 [00:00<?, ? examples/s]



In [28]:
model = T5ForConditionalGeneration.from_pretrained('t5-base')

In [29]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

In [30]:
!pip install rouge_score



In [31]:
rouge = evaluate.load("rouge")

In [41]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def compute_metrics(eval_pred):
    predictions, labels = eval_pred.predictions[0], eval_pred.label_ids

    # Calculate accuracy
    accuracy = accuracy_score(labels, predictions)

    # Calculate precision
    precision = precision_score(labels, predictions)

    # Calculate recall
    recall = recall_score(labels, predictions)

    # Calculate F1 score
    f1 = f1_score(labels, predictions)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        use_stemmer=True,
        rouge_types=[
            'rouge1',
            'rouge2',
            'rougeL'
        ]
    )

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    # Include additional metrics
    additional_metrics = {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

    # Combine the Rouge metrics and additional metrics into a single dictionary
    result.update(additional_metrics)

    # Round all values in the dictionary to 4 decimal places
    return {k: round(v, 4) for k, v in result.items()}


In [33]:
def preprocess_logits_for_metrics(logits, labels):
    """
    Original Trainer may have a memory leak.
    This is a workaround to avoid storing too many tensors that are not needed.
    """
    pred_ids = torch.argmax(logits[0], dim=-1)
    return pred_ids, labels

In [38]:
!pip install transformers[torch]

Collecting accelerate>=0.20.3 (from transformers[torch])
  Downloading accelerate-0.26.1-py3-none-any.whl (270 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m270.9/270.9 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.26.1


In [44]:
!pip install accelerate -U



In [47]:
!pip install transformers[torch]



In [49]:
!pip install transformers[torch]>=4.12.0

In [51]:
!pip show accelerate

Name: accelerate
Version: 0.26.1
Summary: Accelerate
Home-page: https://github.com/huggingface/accelerate
Author: The HuggingFace team
Author-email: sylvain@huggingface.co
License: Apache
Location: /usr/local/lib/python3.10/dist-packages
Requires: huggingface-hub, numpy, packaging, psutil, pyyaml, safetensors, torch
Required-by: 


In [34]:
training_args = TrainingArguments(
    output_dir=OUT_DIR,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    warmup_steps=50,
    weight_decay=0.01,
    logging_dir=OUT_DIR,
    logging_steps=10,
    evaluation_strategy='steps',
    eval_steps=200,
    save_strategy='epoch',
    save_total_limit=2,
    report_to='tensorboard',
    learning_rate=0.0003,
    dataloader_num_workers=4
)

In [36]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_valid,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
    compute_metrics=compute_metrics
)

In [42]:
history = trainer.train()

Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Gen Len
200,0.1325,0.309195,0.915,0.8584,0.8992,216.9106
400,0.0795,0.320744,0.9177,0.8625,0.902,216.9106
600,0.0514,0.331799,0.9211,0.8689,0.9052,216.9106
800,0.0649,0.326689,0.9228,0.8732,0.9072,216.9106
1000,0.0642,0.339646,0.9255,0.8776,0.9097,216.9106
1200,0.0528,0.342,0.9267,0.8808,0.9114,216.9106
1400,0.0258,0.351525,0.9278,0.8821,0.9119,216.9106
1600,0.0451,0.358839,0.9282,0.8837,0.9126,216.9106
1800,0.0299,0.364549,0.9284,0.8838,0.9129,216.9106
2000,0.0207,0.36714,0.9298,0.8858,0.9142,216.9106




In [43]:
trainer.evaluate()



{'eval_loss': 0.37215113639831543,
 'eval_rouge1': 0.9295,
 'eval_rouge2': 0.8863,
 'eval_rougeL': 0.9142,
 'eval_gen_len': 216.9106,
 'eval_runtime': 64.5482,
 'eval_samples_per_second': 3.811,
 'eval_steps_per_second': 0.961,
 'epoch': 10.0}

In [46]:
results = trainer.evaluate()





In [48]:
# Extract the additional_metrics from the results
additional_metrics = results.get("additional_metrics", {})
print(additional_metrics)

{}


In [55]:
trainer.save_model("/content/drive/MyDrive/Colab Notebooks/t5-base-business-summarizer")
trainer.save_model("/content/drive/MyDrive/Colab Notebooks/t5-base-business-summarizer.h5")
trainer.save_model("/content/drive/MyDrive/Colab Notebooks/t5-base-business-summarizer.tf")

In [56]:
model.save_pretrained("/content/drive/MyDrive/Colab Notebooks/t5-base-business-summarizer-pretrained")