
# Fine-tuning BART for summarization

---

## Setup

---

In [None]:
# Installing the required dependencies

%%capture
! pip install transformers
! pip install datasets
! pip install sentencepiece
! pip install rouge_score
! pip install wandb

In [None]:
# Install the `accelerate` package, upgrading any existing installation (`-U`).
!pip install accelerate -U

Collecting accelerate
  Downloading accelerate-0.24.1-py3-none-any.whl (261 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/261.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.9/261.4 kB[0m [31m2.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m261.4/261.4 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.24.1


In [None]:
import torch
import numpy as np
import datasets

# Importing the required libraries from transformers

from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq,
)

from tabulate import tabulate
import nltk
from datetime import datetime

In [None]:
# Toggle for Weights & Biases (wandb) experiment tracking. When enabled, the
# wandb package is imported here and a login is performed for this session,
# so the loss and metrics can be monitored graphically.

WANDB_INTEGRATION = True
if WANDB_INTEGRATION:
    import wandb

    wandb.login()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


## Set language

---

English

In [None]:
language = "english"           # Language of the text corpus. NOTE(review): no other cell visible here reads this variable — confirm it is still needed.

## Model and tokenizer

---

Download model and tokenizer. Use default parameters or try custom values (see [HF Bart configuration](https://huggingface.co/transformers/_modules/transformers/configuration_bart.html) and [Fairseq Bart](https://github.com/pytorch/fairseq/tree/master/examples/bart)).

In [None]:
# Token budgets used when padding/truncating the articles (encoder side)
# and the reference summaries (decoder side).
encoder_max_length, decoder_max_length = 512, 128

# One checkpoint name drives both the tokenizer and the seq2seq model.
model_name = "facebook/bart-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

## Data

---

### Download

In [None]:
# Download the CNN/DailyMail dataset (config 3.0.0), keeping only the first 10% of the train split.
data = datasets.load_dataset(
    "cnn_dailymail",
    "3.0.0",
    split="train[:10%]",
)

Downloading builder script:   0%|          | 0.00/8.33k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/9.88k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/15.1k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/159M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/376M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/12.3M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/661k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/572k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

For demonstration, we are only using a small portion of the data.

In [None]:
data[5]                                                     # Display the record at index 5 as an example of the raw data

{'article': 'BAGHDAD, Iraq (CNN) -- Dressed in a Superman shirt, 5-year-old Youssif held his sister\'s hand Friday, seemingly unaware that millions of people across the world have been touched by his story. Nearby, his parents talked about the new future and hope they have for their boy -- and the potential for recovery from his severe burns. Youssif holds his sister\'s hand Friday. He\'s wearing a facial mask often used to help burn victims. It\'s the best birthday present the Iraqi family could ever have imagined for their boy: Youssif turns 6 next Friday. "I was so happy I didn\'t know what to do with myself," his mother, Zainab, told CNN, a broad smile across her face. "I didn\'t think the reaction would be this big." His father said he was on the roof of his house when CNN called him with the news about the outpouring of support for his son. "We just want to thank everyone who has come forward," he said. "We knew there was kindness out there." Like his wife, he couldn\'t stop smil

### Prepare

**Format and split into train and validation sets**

In [None]:
# Hold out 10% of the examples for validation.
# A fixed seed keeps the shuffle (and therefore every downstream metric and the
# sample comparison) reproducible across kernel restarts, and the two splits are
# read by key instead of relying on the ordering of `.values()`.
_split = data.train_test_split(test_size=0.1, seed=42)
train_data_txt, validation_data_txt = _split["train"], _split["test"]

In [None]:
train_data_txt                                                                                  # Display the training split summary (feature names and number of rows)

Dataset({
    features: ['article', 'highlights', 'id'],
    num_rows: 25839
})

**Preprocess and tokenize**

In [None]:
def batch_tokenize_preprocess(batch, tokenizer, max_source_length, max_target_length):
    """Tokenize a batch of article/highlight pairs into model inputs and labels.

    Args:
        batch: Mapping with "article" and "highlights" entries (lists of strings).
        tokenizer: Callable tokenizer that also exposes ``pad_token_id``.
        max_source_length: Length every article is padded/truncated to.
        max_target_length: Length every highlight is padded/truncated to.

    Returns:
        dict holding the tokenizer outputs for the articles plus a "labels"
        entry with the highlight token ids, where every padding id is
        replaced by -100.
    """
    shared_kwargs = {"padding": "max_length", "truncation": True}
    articles = tokenizer(batch["article"], max_length=max_source_length, **shared_kwargs)
    summaries = tokenizer(batch["highlights"], max_length=max_target_length, **shared_kwargs)

    features = dict(articles.items())
    # Padded label positions are set to -100 so they are ignored in the loss.
    features["labels"] = [
        [token_id if token_id != tokenizer.pad_token_id else -100 for token_id in sequence]
        for sequence in summaries["input_ids"]
    ]
    return features

def _tokenize_split(split):
    """Tokenize every example of ``split`` in batches and drop its original columns."""
    return split.map(
        lambda batch: batch_tokenize_preprocess(
            batch, tokenizer, encoder_max_length, decoder_max_length
        ),
        batched=True,
        remove_columns=split.column_names,
    )


# Model-ready versions of the training and validation splits.
train_data = _tokenize_split(train_data_txt)
validation_data = _tokenize_split(validation_data_txt)

Map:   0%|          | 0/25839 [00:00<?, ? examples/s]

Map:   0%|          | 0/2872 [00:00<?, ? examples/s]

## Training

---

### Metrics

In [None]:
# Borrowed from https://github.com/huggingface/transformers/blob/master/examples/seq2seq/run_summarization.py

nltk.download("punkt", quiet=True)                                                # Quietly fetch the NLTK "punkt" data; nltk.sent_tokenize is called in postprocess_text
metric = datasets.load_metric("rouge")                                            # Load the ROUGE metric used by compute_metrics
# NOTE(review): `datasets.load_metric` appears to be deprecated in newer `datasets` releases — confirm the installed version still provides it.


def postprocess_text(preds, labels):
    """Normalise decoded predictions and references for ROUGE scoring.

    Each text is stripped of surrounding whitespace and re-joined with one
    sentence per line, because rougeLSum expects a newline after each sentence.

    Args:
        preds: Iterable of decoded prediction strings.
        labels: Iterable of decoded reference strings.

    Returns:
        Tuple ``(preds, labels)`` of two lists of newline-separated texts.
    """

    def _one_sentence_per_line(text):
        # Strip first, then split into sentences and put each on its own line.
        return "\n".join(nltk.sent_tokenize(text.strip()))

    cleaned_preds = [_one_sentence_per_line(pred) for pred in preds]
    cleaned_labels = [_one_sentence_per_line(label) for label in labels]
    return cleaned_preds, cleaned_labels


def compute_metrics(eval_preds):
    """Compute ROUGE scores and the mean generated length for one evaluation pass.

    Args:
        eval_preds: ``(predictions, labels)`` pair of token-id arrays. The
            predictions may arrive as a tuple, in which case its first element
            is used.

    Returns:
        dict mapping every ROUGE key to its mid F-measure scaled to 0-100, plus
        "gen_len" (mean number of non-padding predicted tokens); all values are
        rounded to 4 decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tokenizer.pad_token_id
    # -100 cannot be decoded. Labels use it for ignored positions, and the
    # predictions can contain it as well when batches with different generated
    # lengths are concatenated, so both arrays are mapped back to the pad id.
    preds = np.where(np.asarray(preds) != -100, preds, pad_id)
    labels = np.where(np.asarray(labels) != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing (strip + one sentence per line for rougeLSum)
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )
    # Keep only the mid F-measure of each ROUGE variant, as a percentage
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

    # Length of each prediction, counted after the -100 replacement so that
    # padding of either kind is excluded
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    return {k: round(v, 4) for k, v in result.items()}

  metric = datasets.load_metric("rouge")


Downloading builder script:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

### Training arguments

In [None]:
# Training arguments for the transformer

training_args = Seq2SeqTrainingArguments(
    output_dir="results",
    num_train_epochs=1,  # demo: a single epoch because of limited compute
    do_train=True,
    do_eval=True,
    per_device_train_batch_size=4,  # demo
    per_device_eval_batch_size=4,
    # learning_rate=3e-05,  # left disabled, so the library default is used
    warmup_steps=500,
    weight_decay=0.1,
    label_smoothing_factor=0.1,
    # Generate summaries during evaluation so ROUGE is computed on real output
    predict_with_generate=True,
    # Without an explicit limit every evaluation summary was cut at the default
    # generation length (eval_gen_len was exactly 20.0); allow as many tokens as
    # the reference summaries are tokenized to.
    generation_max_length=decoder_max_length,
    logging_dir="logs",
    logging_steps=50,
    save_total_limit=3,
)

# Collator that assembles the tokenized examples into batches for the seq2seq model.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Trainer wiring together the model, the training arguments, the tokenized
# train/validation datasets, the tokenizer and the ROUGE metric function.

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_data,
    eval_dataset=validation_data,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
import transformers
import accelerate

# Report the library versions this run was executed with.
for _label, _module in (("Transformers", transformers), ("Accelerate", accelerate)):
    print(f"{_label} version:", _module.__version__)


Transformers version: 4.35.2
Accelerate version: 0.24.1


### Train

Wandb integration

In [None]:
if WANDB_INTEGRATION:
    # The same label serves as the W&B project, the dataset tag and the run-name stem.
    run_label = "Fine-tune Bart on CNN-daily Only"
    wandb_run = wandb.init(
        project=run_label,
        config=dict(
            per_device_train_batch_size=training_args.per_device_train_batch_size,
            learning_rate=training_args.learning_rate,
            dataset=run_label,
        ),
    )

    # Name the run after the label plus the current wall-clock time (HHMMSS).
    now = datetime.now()
    current_time = now.strftime("%H%M%S")
    wandb_run.name = f"run_{run_label}_{current_time}"

[34m[1mwandb[0m: Currently logged in as: [33manikan[0m ([33maml01[0m). Use [1m`wandb login --relogin`[0m to force relogin


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112779033333532, max=1.0…

Evaluate before fine-tuning

In [None]:
trainer.evaluate()            # Baseline: evaluate the pre-trained model on the validation set before any fine-tuning

You're using a BartTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'eval_loss': 5.800720691680908,
 'eval_rouge1': 15.9206,
 'eval_rouge2': 5.2129,
 'eval_rougeL': 12.6713,
 'eval_rougeLsum': 14.5374,
 'eval_gen_len': 20.0,
 'eval_runtime': 340.1088,
 'eval_samples_per_second': 8.444,
 'eval_steps_per_second': 2.111}

Train the model

In [None]:
%%wandb
# The `%%wandb` cell magic above displays the Wandb charts for this cell;
# remove that line to train without the embedded charts.

trainer.train()

Step,Training Loss
50,5.421
100,4.4943
150,4.2188
200,3.9219
250,3.8799
300,3.7814
350,3.7818
400,3.87
450,3.7978
500,3.6967


TrainOutput(global_step=6460, training_loss=3.503844100550601, metrics={'train_runtime': 2474.2693, 'train_samples_per_second': 10.443, 'train_steps_per_second': 2.611, 'total_flos': 7877490172231680.0, 'train_loss': 3.503844100550601, 'epoch': 1.0})

Evaluate after fine-tuning

In [None]:
trainer.evaluate()                 # Evaluate again after fine-tuning, for comparison with the baseline numbers



{'eval_loss': 3.1881580352783203,
 'eval_rouge1': 24.0467,
 'eval_rouge2': 10.7807,
 'eval_rougeL': 19.7447,
 'eval_rougeLsum': 22.4718,
 'eval_gen_len': 20.0,
 'eval_runtime': 343.4575,
 'eval_samples_per_second': 8.362,
 'eval_steps_per_second': 2.091,
 'epoch': 1.0}

In [None]:
# Finish the W&B run created by wandb.init (it only exists when the integration is enabled).
if WANDB_INTEGRATION:
    wandb_run.finish()

VBox(children=(Label(value='0.001 MB of 0.013 MB uploaded\r'), FloatProgress(value=0.08962877365304059, max=1.…

0,1
eval/gen_len,▁▁
eval/loss,█▁
eval/rouge1,▁█
eval/rouge2,▁█
eval/rougeL,▁█
eval/rougeLsum,▁█
eval/runtime,▁█
eval/samples_per_second,█▁
eval/steps_per_second,█▁
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇████

0,1
eval/gen_len,20.0
eval/loss,3.18816
eval/rouge1,24.0467
eval/rouge2,10.7807
eval/rougeL,19.7447
eval/rougeLsum,22.4718
eval/runtime,343.4575
eval/samples_per_second,8.362
eval/steps_per_second,2.091
train/epoch,1.0


## Evaluation

---

**Generate summaries from the fine-tuned model and compare them with those generated from the original, pre-trained one.**

In [None]:
# Comparing the results from before and after fine-tuning

def generate_summary(test_samples, model, max_length=None):
    """Generate summaries for a set of articles with the given model.

    Args:
        test_samples: Mapping/dataset whose "article" entry holds the source texts.
        model: Seq2seq model used for generation; inputs are moved to its device.
        max_length: Maximum length of each generated summary in tokens.
            Defaults to ``decoder_max_length``. Without an explicit limit the
            summaries were cut off mid-sentence at the model's short default
            generation length.

    Returns:
        Tuple ``(outputs, output_str)``: the generated token ids and the decoded
        summaries with special tokens removed.
    """
    if max_length is None:
        max_length = decoder_max_length

    # Tokenize the articles exactly as during training (fixed-length padding,
    # truncated to the encoder budget) and return PyTorch tensors.
    inputs = tokenizer(
        test_samples["article"],
        padding="max_length",
        truncation=True,
        max_length=encoder_max_length,
        return_tensors="pt",
    )
    input_ids = inputs.input_ids.to(model.device)
    attention_mask = inputs.attention_mask.to(model.device)
    outputs = model.generate(
        input_ids, attention_mask=attention_mask, max_length=max_length
    )
    output_str = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    return outputs, output_str


# Fixed slice of 16 validation articles used for the side-by-side comparison.
test_samples = validation_data_txt.select(range(16))

# A fresh copy of the pre-trained checkpoint serves as the "before" baseline.
model_before_tuning = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# generate_summary returns (token ids, decoded strings); only the strings are kept.
summaries_before_tuning, summaries_after_tuning = (
    generate_summary(test_samples, candidate)[1]
    for candidate in (model_before_tuning, model)
)

In [None]:
# Printing and showing the final results

# Side-by-side table of the fine-tuned and the pre-trained summaries.
comparison_rows = [
    (idx, after, before)
    for idx, (after, before) in enumerate(
        zip(summaries_after_tuning, summaries_before_tuning)
    )
]
print(tabulate(comparison_rows, headers=["Id", "Summary after", "Summary before"]))

# Reference summaries first, then the source articles, each numbered by sample index.
for heading, column, header in (
    ("\nTarget summaries:\n", "highlights", "Target summary"),
    ("\nSource documents:\n", "article", "Document"),
):
    print(heading)
    print(tabulate(list(enumerate(test_samples[column])), headers=["Id", header]))

  Id  Summary after                                                                                       Summary before
----  --------------------------------------------------------------------------------------------------  -----------------------------------------------------------------------------------------
   0  Federal officials have given the go-ahead for a motorcycle race in California's Johnson Valley      (CNN) -- Federal officials have given the go-ahead for a motorcycle race in
   1  Dr. Carter G. Woodson was a pioneer in the study of African-American                                (CNN Student News) -- February marks the beginning of Black History Month, a federally
   2  Franck Ribery and Karim Benzema are accused of soliciting an underage                               (CNN) -- Two of France's most high-profile football stars will face a
   3  Novak Djokovic beats Gilles Simon 6-3 6-1 7-                                                        (CNN) -- Serbia and France

In [None]:
trainer.save_model("bart_orig")         # Save the fine-tuned model through the trainer into the "bart_orig" directory

In [None]:
!zip -r /content/bart_orig.zip /content/bart_orig

  adding: content/bart_orig/ (stored 0%)
  adding: content/bart_orig/generation_config.json (deflated 45%)
  adding: content/bart_orig/merges.txt (deflated 53%)
  adding: content/bart_orig/tokenizer_config.json (deflated 76%)
  adding: content/bart_orig/special_tokens_map.json (deflated 52%)
  adding: content/bart_orig/vocab.json (deflated 59%)
  adding: content/bart_orig/config.json (deflated 64%)
  adding: content/bart_orig/model.safetensors (deflated 8%)
  adding: content/bart_orig/tokenizer.json (deflated 72%)
  adding: content/bart_orig/training_args.bin (deflated 51%)
