# News Articles Summarization with T5-base

### Setting up the environment

In [None]:
! pip install evaluate



In [None]:
! pip install rouge_score



### Importing Libraries

In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn import preprocessing
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from transformers import (
    T5Tokenizer,
    T5ForConditionalGeneration,
    TrainingArguments,
    Trainer
)
import numpy as np
from datasets import load_dataset
import evaluate

### Loading Dataset

In [None]:
# Load the merged articles/summaries table from the Kaggle input dataset.
DATA_CSV = '/kaggle/input/news-articles-summary/merge_df.csv'
data = pd.read_csv(DATA_CSV)

In [None]:
# Preview the first rows of the raw table.
data.head()

Unnamed: 0.1,Unnamed: 0,File_path,Articles,Summaries
0,0,politics,Budget to set scene for election..Gordon Brown...,- Increase in the stamp duty threshold from £6...
1,1,politics,Army chiefs in regiments decision..Military ch...,"""They are very much not for the good and will ..."
2,2,politics,Howard denies split over ID cards..Michael How...,Michael Howard has denied his shadow cabinet w...
3,3,politics,Observers to monitor UK election..Ministers wi...,The report said individual registration should...
4,4,politics,Kilroy names election seat target..Ex-chat sho...,"UKIP's leader, Roger Knapman, has said he is g..."


In [None]:
# The 'File_path' column holds the news category, so give it that name.
# Reassigning instead of using inplace=True keeps the step chainable and avoids
# mutating a frame that an earlier cell already displayed.
data = data.rename(columns={'File_path': 'Category'})

In [None]:
# Confirm the column is now called 'Category'.
data.head()

Unnamed: 0.1,Unnamed: 0,Category,Articles,Summaries
0,0,politics,Budget to set scene for election..Gordon Brown...,- Increase in the stamp duty threshold from £6...
1,1,politics,Army chiefs in regiments decision..Military ch...,"""They are very much not for the good and will ..."
2,2,politics,Howard denies split over ID cards..Michael How...,Michael Howard has denied his shadow cabinet w...
3,3,politics,Observers to monitor UK election..Ministers wi...,The report said individual registration should...
4,4,politics,Kilroy names election seat target..Ex-chat sho...,"UKIP's leader, Roger Knapman, has said he is g..."


In [None]:
# (rows, columns) of the full table.
data.shape

(5449, 4)

In [None]:
# Number of articles per category.
data['Category'].value_counts()

Category
business         1228
politics         1158
sport            1021
entertainment     925
tech              802
crime             110
lifestyle          78
law                41
sports             30
science            25
technology         18
accidents           4
architecture        4
art                 2
health              2
environment         1
Name: count, dtype: int64

In [None]:
# Restrict the corpus to the entertainment category and renumber the rows from 0.
is_entertainment = data['Category'].eq('entertainment')
data = data.loc[is_entertainment].reset_index(drop=True)

In [None]:
# Keep only the text columns; the old row counter and the (now constant) category
# are not used for training. Reassigning avoids in-place mutation.
data = data.drop(columns=['Unnamed: 0', 'Category'])

In [None]:
# Preview the remaining columns after filtering and dropping.
data.head()

Unnamed: 0,Articles,Summaries
0,Super Size Me wins writers' award..Super Size ...,Spurlock was given his award on the same day t...
1,"Mogul Wilson backing UK rap band..Tony Wilson,...","Tony Wilson, the music mogul who established t..."
2,Police praise 'courageous' Ozzy..Rock star Ozz...,"""I could have been badly injured or shot or an..."
3,Eastwood's Baby scoops top Oscars..Clint Eastw...,The boxing drama was named best picture and Ea...
4,Actor Scott is new Bond favourite..Bookmaker W...,Bookmaker William Hill has stopped taking bets...


In [None]:
# Hold out 10% of the articles; the fixed random_state makes the split reproducible.
data_train, data_test = train_test_split(data, test_size=0.10, random_state=42)

In [None]:
# Convert both pandas splits to Hugging Face datasets. preserve_index=False keeps
# the shuffled pandas index from being carried along.
train_ds, test_ds = (
    Dataset.from_pandas(frame, preserve_index=False)
    for frame in (data_train, data_test)
)

In [None]:
# Empty container that will hold the named splits.
articles = DatasetDict()

In [None]:
# Register both splits; the held-out split is stored under the key 'test'.
articles['train'] = train_ds
articles['test'] = test_ds

In [None]:
# Show the split names, features and row counts.
articles

DatasetDict({
    train: Dataset({
        features: ['Articles', 'Summaries'],
        num_rows: 832
    })
    test: Dataset({
        features: ['Articles', 'Summaries'],
        num_rows: 93
    })
})

### Data Pre-processing

In [None]:
# Run configuration, referenced by the preprocessing and training cells.
BATCH_SIZE = 4  # per-device batch size, used for both training and evaluation
NUM_PROCS = 4  # worker processes passed to Dataset.map as num_proc
EPOCHS = 10  # number of training epochs
OUT_DIR = 'results_t5base'  # used as both output_dir and logging_dir
MAX_LENGTH = 512  # max token length applied to inputs and to labels

In [None]:
# Shorter names for the two splits. The 'test' split is the one later passed to
# the Trainer as its evaluation set.
dataset_train = articles['train']
dataset_valid = articles['test']

In [None]:
# Tokenizer for the 't5-base' checkpoint (the same checkpoint the model is loaded from).
tokenizer = T5Tokenizer.from_pretrained('t5-base')

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
def preprocess_function(examples):
    """Tokenize a batch of articles and their reference summaries.

    Parameters
    ----------
    examples : mapping
        Batch with an 'Articles' column (source texts) and a 'Summaries'
        column (target texts), as passed by ``Dataset.map(batched=True)``.

    Returns
    -------
    The tokenizer output for the prefixed articles, with an added "labels"
    entry holding the token ids of the summaries.

    Notes
    -----
    Inputs and labels are both truncated and padded to MAX_LENGTH tokens.
    NOTE(review): the labels keep the tokenizer's pad id at padded positions
    (they are not replaced by -100), so those positions presumably count
    towards the training loss — confirm whether that is intended. Changing it
    would also require compute_metrics to ignore the masked positions.
    """
    shared_kwargs = dict(
        max_length=MAX_LENGTH,
        truncation=True,
        padding='max_length',
    )

    # Every article is prefixed with the "summarize: " task prompt.
    prompts = [f"summarize: {article}" for article in examples['Articles']]
    encoded = tokenizer(prompts, **shared_kwargs)

    # Encode the reference summaries inside the tokenizer's target-mode context.
    references = list(examples['Summaries'])
    with tokenizer.as_target_tokenizer():
        encoded_refs = tokenizer(references, **shared_kwargs)

    encoded["labels"] = encoded_refs["input_ids"]
    return encoded

In [None]:
# Tokenize the training split in batches, spread over NUM_PROCS worker processes.
tokenized_train = dataset_train.map(preprocess_function, batched=True, num_proc=NUM_PROCS)

     

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

#3:   0%|          | 0/1 [00:00<?, ?ba/s]



In [None]:
# Tokenize the held-out split the same way as the training split.
tokenized_valid = dataset_valid.map(preprocess_function, batched=True, num_proc=NUM_PROCS)

      

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

#2:   0%|          | 0/1 [00:00<?, ?ba/s]



In [None]:
# Load the pretrained 't5-base' weights and move them to the GPU when one is available.
model = T5ForConditionalGeneration.from_pretrained('t5-base')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Module.to returns the module itself. Assigning the result (instead of leaving the
# call as the cell's last expression) stops the notebook from echoing the whole
# module tree as cell output.
model = model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

In [None]:
# ROUGE metric object; its compute() method is called inside compute_metrics.
rouge = evaluate.load("rouge")

In [None]:
def compute_metrics(eval_pred):
    """Score predicted token ids against the reference summaries with ROUGE.

    Parameters
    ----------
    eval_pred :
        Object supplied by the Trainer. ``eval_pred.predictions`` is the tuple
        returned by ``preprocess_logits_for_metrics`` (its first element holds
        the predicted token ids) and ``eval_pred.label_ids`` holds the labels.

    Returns
    -------
    dict
        ``rouge1``, ``rouge2``, ``rougeL`` and ``gen_len`` (mean number of
        non-padding predicted tokens per example), each rounded to 4 decimals.
    """
    predictions, labels = eval_pred.predictions[0], eval_pred.label_ids
    pad_id = tokenizer.pad_token_id

    # -100 marks label positions that are to be ignored. It is not a valid token
    # id, so it must be replaced by the pad id before decoding.
    ignored = labels == -100

    # Give the predictions the same treatment: -100 can also appear there as a
    # filler value, and whatever was predicted at an ignored label position
    # must not be scored. Both replacements are no-ops when no -100 is present.
    predictions = np.where(predictions == -100, pad_id, predictions)
    if predictions.shape == labels.shape:
        predictions = np.where(ignored, pad_id, predictions)
    labels = np.where(ignored, pad_id, labels)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        use_stemmer=True,
        rouge_types=[
            'rouge1',
            'rouge2',
            'rougeL'
        ]
    )

    # Length of each prediction, counted in non-padding tokens.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [None]:
def preprocess_logits_for_metrics(logits, labels):
    """Reduce raw logits to predicted token ids before they are stored.

    Workaround to avoid keeping the full vocabulary-sized logits of every
    evaluation batch in memory: only the argmax token ids are kept.

    Parameters
    ----------
    logits : torch.Tensor or tuple/list of torch.Tensor
        Model outputs. When a tuple or list is given, its first element is
        taken as the logits; a bare tensor is used as it is.
    labels : torch.Tensor
        Labels for the batch, passed through unchanged.

    Returns
    -------
    tuple
        ``(pred_ids, labels)`` where ``pred_ids`` is the argmax of the logits
        over the last dimension.
    """
    # Only unwrap containers: indexing a bare tensor with [0] would silently
    # select the first example of the batch instead of the logits.
    if isinstance(logits, (tuple, list)):
        logits = logits[0]
    pred_ids = torch.argmax(logits, dim=-1)
    return pred_ids, labels

In [None]:
# Training configuration for the fine-tuning run.
training_args = TrainingArguments(
    # Checkpoints and TensorBoard logs share one directory.
    output_dir=OUT_DIR,
    logging_dir=OUT_DIR,
    report_to='tensorboard',
    # Optimisation schedule.
    num_train_epochs=EPOCHS,
    learning_rate=0.0003,
    warmup_steps=50,
    weight_decay=0.01,
    # Batching and data loading.
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    dataloader_num_workers=4,
    # Log every 10 steps, evaluate every 200 steps, checkpoint once per epoch
    # with save_total_limit capping the number of checkpoints kept at 2.
    logging_steps=10,
    evaluation_strategy='steps',
    eval_steps=200,
    save_strategy='epoch',
    save_total_limit=2,
)

In [None]:
# Wire the model, the tokenized splits and the metric hooks into the Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    # Data: the held-out split serves as the evaluation set.
    train_dataset=tokenized_train,
    eval_dataset=tokenized_valid,
    # Metric hooks: logits are first reduced to token ids, then scored with ROUGE.
    compute_metrics=compute_metrics,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
)

In [None]:
# Run fine-tuning; the value returned by train() is kept in `history`.
history = trainer.train()



Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Gen Len
200,0.4788,0.234253,0.9157,0.8527,0.9018,191.0108
400,0.309,0.200994,0.9257,0.8668,0.9133,191.2043
600,0.3917,0.185056,0.9319,0.8784,0.9204,191.2043
800,0.2849,0.175317,0.9357,0.8852,0.925,191.2043
1000,0.2737,0.173083,0.9376,0.8896,0.9277,191.2043




In [None]:
# Final evaluation pass on the Trainer's eval_dataset (the held-out split).
trainer.evaluate()



{'eval_loss': 0.17307457327842712,
 'eval_rouge1': 0.9376,
 'eval_rouge2': 0.8891,
 'eval_rougeL': 0.9275,
 'eval_gen_len': 191.2043,
 'eval_runtime': 23.5104,
 'eval_samples_per_second': 3.956,
 'eval_steps_per_second': 0.51,
 'epoch': 10.0}

In [None]:
# Save the fine-tuned model through the Trainer into the Kaggle working directory.
trainer.save_model("/kaggle/working/t5-base-entertainment-summarizer")

In [None]:
# Save the same model object again, to a second directory.
# NOTE(review): this looks redundant with the trainer.save_model call — confirm
# whether both copies are needed. The tokenizer is not saved by this call.
model.save_pretrained("/kaggle/working/t5-base-summarizer")

In [None]:
import zipfile
import os
from IPython.display import FileLink

def zip_dir(directory = os.curdir, file_name = 'directory.zip'):
    """
    Zip all the files in a directory (recursively) and return a download link.

    The process working directory is left unchanged. Entries are stored under
    their path relative to `directory`.

    Parameters
    _____
    directory: str
        directory that needs to be zipped, default is the current working directory

    file_name: str
        the name of the zipped file (including .zip), default is 'directory.zip';
        a relative name is created inside `directory`

    Returns
    _____
    IPython.display.FileLink
        a hyperlink which can be used to download the zip file
    """
    archive_path = os.path.normpath(os.path.join(directory, file_name))
    archive_real = os.path.realpath(archive_path)

    # The context manager closes the archive, which is what writes its central
    # directory; an unclosed ZipFile can leave a truncated, unreadable file.
    with zipfile.ZipFile(archive_path, mode='w') as zip_ref:
        for folder, _, files in os.walk(directory):
            for file in files:
                file_path = os.path.join(folder, file)
                # Skip only the archive being written. Comparing resolved paths
                # avoids dropping unrelated files whose names merely contain
                # `file_name`.
                if os.path.realpath(file_path) == archive_real:
                    continue
                zip_ref.write(file_path, arcname=os.path.relpath(file_path, directory))

    return FileLink(archive_path)

In [None]:
# Zip the current working directory under the default archive name and show the download link.
zip_dir()