In [1]:
import torch
import numpy as np
import datasets

from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq,
    EarlyStoppingCallback
)

# from tabulate import tabulate
import nltk
from datetime import datetime
import os

In [2]:
# Prefer CUDA when PyTorch reports it as available; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda


In [3]:
# Sets a process environment variable named "trust_remote_code".
# NOTE(review): this key does not appear to be honoured by the loaders used below —
# the warning recorded under cell 11 still asks for `trust_remote_code=True` to be
# passed as an argument. Confirm whether this line has any effect.
os.environ["trust_remote_code"] = "True"

In [4]:
# Raw evaluation split: the first 2000 rows of the CNN/DailyMail 3.0.0 test set.
testing_data = datasets.load_dataset(
    path="cnn_dailymail",
    name="3.0.0",
    split="test[:2000]",
)

In [5]:
def flatten(example):
    """Expose the article/highlights pair under the names used downstream.

    Args:
        example: mapping with "article" and "highlights" entries.

    Returns:
        dict with "document" (the article) and "summary" (the highlights).
    """
    article = example["article"]
    highlights = example["highlights"]
    return {"document": article, "summary": highlights}

def listToSamples(example):
    """Return a new dict holding only the "document" and "summary" fields of `example`."""
    return {field: example[field] for field in ("document", "summary")}

In [6]:
# Add the document/summary columns, then pass each row through listToSamples.
testing_dataset = testing_data.map(flatten).map(listToSamples)

In [7]:
def preprocess(batch, tokenizer, max_source_length, max_target_length):
    """Tokenize a batch of documents and summaries into model inputs and labels.

    Args:
        batch: mapping with "document" and "summary" entries.
        tokenizer: callable tokenizer exposing `pad_token_id`; its result must
            provide `.items()` and an "input_ids" entry.
        max_source_length: pad/truncate length for the documents.
        max_target_length: pad/truncate length for the summaries.

    Returns:
        dict with every field of the tokenized documents plus "labels", where
        pad positions of the tokenized summaries are replaced by -100.
    """
    documents = batch["document"]
    summaries = batch["summary"]

    encoded_documents = tokenizer(
        documents,
        padding="max_length",
        truncation=True,
        max_length=max_source_length,
    )
    encoded_summaries = tokenizer(
        summaries,
        padding="max_length",
        truncation=True,
        max_length=max_target_length,
    )

    features = dict(encoded_documents.items())

    # Pad positions become -100 so they are ignored in the loss.
    pad_id = tokenizer.pad_token_id
    masked_labels = []
    for summary_ids in encoded_summaries["input_ids"]:
        masked_labels.append(
            [-100 if token_id == pad_id else token_id for token_id in summary_ids]
        )
    features["labels"] = masked_labels
    return features

In [8]:
# The import stays in this cell on purpose: the assignment below rebinds the name
# `pipeline` to the constructed pipeline object, so re-running the cell needs the
# factory function to be imported again first.
from transformers import pipeline

# Exactly one checkpoint/tokenizer pair is active at a time; the alternatives are
# kept for switching between the evaluated models.
# `device` comes from cell 2, so the pipeline follows the same CUDA-or-CPU choice
# instead of failing on machines without a GPU.
# pipeline = pipeline("text2text-generation", model="ahmeddsakrr/text_summarizer_t5", tokenizer="t5-small", device=device, temperature=1)
# pipeline = pipeline("text2text-generation", model="ahmeddsakrr/text_summarizer_bart", tokenizer="facebook/bart-base", device=device, temperature=1)
pipeline = pipeline("text2text-generation", model="ahmeddsakrr/text_summarizer_pegasus", tokenizer="google/pegasus-xsum", device=device, temperature=1)




In [9]:
# Pull the underlying model and tokenizer out of the pipeline for direct use below.
model, tokenizer = pipeline.model, pipeline.tokenizer

In [10]:
# Token budgets used when padding/truncating documents and summaries.
MAX_SOURCE_LENGTH = 512
MAX_TARGET_LENGTH = 128


def _tokenize_batch(batch):
    """Run `preprocess` on one batch with the notebook's tokenizer and length limits."""
    return preprocess(batch, tokenizer, MAX_SOURCE_LENGTH, MAX_TARGET_LENGTH)


# Replace every existing column with the tokenized features.
# Note: this rebinds `testing_data`, which previously held the raw split.
testing_data = testing_dataset.map(
    _tokenize_batch,
    batched=True,
    remove_columns=testing_dataset.column_names,
)

In [11]:
# Sentence-tokenizer data used by nltk.sent_tokenize in postprocess_text.
nltk.download("punkt", quiet=True)

# Pass trust_remote_code explicitly: the loader's own warning states that the
# argument silences the message and becomes mandatory in the next major release
# of `datasets`.
metric = datasets.load_metric("rouge", trust_remote_code=True)


def postprocess_text(preds, labels):
    """Strip each text and put every sentence on its own line.

    Args:
        preds: iterable of predicted summary strings.
        labels: iterable of reference summary strings.

    Returns:
        tuple `(preds, labels)` of lists of newline-joined sentences.
    """

    def _sentence_per_line(texts):
        # rougeLSum expects newline after each sentence
        return ["\n".join(nltk.sent_tokenize(text)) for text in texts]

    cleaned_preds = [text.strip() for text in preds]
    cleaned_labels = [text.strip() for text in labels]

    return _sentence_per_line(cleaned_preds), _sentence_per_line(cleaned_labels)

  metric = datasets.load_metric("rouge")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [12]:
def compute_metrics(eval_preds):
    """Decode predictions and references and score them with the ROUGE metric.

    Args:
        eval_preds: pair `(preds, labels)` of token-id arrays. `preds` may be a
            tuple, in which case its first element is used.

    Returns:
        dict mapping each key returned by `metric.compute` to its mid F-measure
        times 100, plus "gen_len" (mean count of non-pad prediction tokens);
        every value is rounded to 4 decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tokenizer.pad_token_id
    # -100 cannot be decoded. Labels always carry it (see preprocess); predictions
    # can carry it too when the trainer pads gathered batches, so replace it in
    # both arrays before decoding.
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )
    # Keep the mid F-measure of each ROUGE variant, scaled by 100.
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

    # Generated length = number of non-pad tokens per prediction.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    return {k: round(v, 4) for k, v in result.items()}

In [13]:
training_args = Seq2SeqTrainingArguments(
    # Output locations, logging cadence and checkpoint retention
    output_dir="results",
    logging_dir="logs",
    logging_steps=50,
    save_total_limit=3,
    # Run modes and schedule: evaluate and save once per epoch
    do_train=True,
    do_eval=True,
    num_train_epochs=5,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # Batch sizes (demo-sized)
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    # Optimisation hyper-parameters
    learning_rate=3e-05,
    warmup_steps=500,
    weight_decay=0.1,
    label_smoothing_factor=0.1,
    # Produce predictions with generate() so compute_metrics can decode them
    predict_with_generate=True,
    # Best-model selection: lowest evaluation loss wins and is reloaded at the end
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)



In [14]:
# Switch the model to evaluation mode before running predictions.
# As the last expression of the cell, the returned module is displayed below.
model.eval()

PegasusForConditionalGeneration(
  (model): PegasusModel(
    (shared): Embedding(96103, 1024, padding_idx=0)
    (encoder): PegasusEncoder(
      (embed_tokens): Embedding(96103, 1024, padding_idx=0)
      (embed_positions): PegasusSinusoidalPositionalEmbedding(512, 1024)
      (layers): ModuleList(
        (0-15): 16 x PegasusEncoderLayer(
          (self_attn): PegasusAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): ReLU()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          (final_layer_nor

In [29]:
# used for t5
# NOTE(review): the recorded output presumably comes from a run with the T5 line
# in cell 8 enabled (execution count 29 is out of sequence). With cell 8 as
# written (Pegasus active), a fresh Run All evaluates Pegasus here — confirm.
# The progress bar shows 2873 steps, whereas test[:2000] at eval batch size 4
# gives 500 steps, so this run presumably used a larger split — TODO confirm.
trainer.predict(testing_data)

  0%|          | 0/2873 [00:00<?, ?it/s]

PredictionOutput(predictions=array([[    0,    37, 10748, ...,     3,     5,    37],
       [    0,    37,    23, ...,   451,    47,   435],
       [    0,  1290,  1483, ...,   118,   412,     5],
       ...,
       [    0, 11859,  6424, ...,    12, 10755,   112],
       [    0,  2184,  1916, ...,   145,   192,    18],
       [    0,     3,  8365, ...,    31,  7248,  2976]], dtype=int64), label_ids=array([[19428,  1527,     8, ...,  -100,  -100,  -100],
       [   37,    23,     9, ...,  -100,  -100,  -100],
       [ 1290,  1483, 11374, ...,  -100,  -100,  -100],
       ...,
       [ 6424,    63,    47, ...,  -100,  -100,  -100],
       [ 2184,  1916,    72, ...,  -100,  -100,  -100],
       [    3,  8365,   302, ...,    31,     7,     1]], dtype=int64), metrics={'test_loss': 0.78, 'test_rouge1': 0.6, 'test_rouge2': 0.52, 'test_rougeL': 0.64, 'test_runtime': 991.9659, 'test_samples_per_second': 11.583, 'test_steps_per_second': 2.896})

In [37]:
# used for bart
# NOTE(review): the recorded output presumably comes from a run with the BART line
# in cell 8 enabled (execution count 37 is out of sequence). With cell 8 as
# written (Pegasus active), a fresh Run All evaluates Pegasus here — confirm.
# The progress bar shows 2873 steps, whereas test[:2000] at eval batch size 4
# gives 500 steps, so this run presumably used a larger split — TODO confirm.
trainer.predict(testing_data)   

  attn_output = torch.nn.functional.scaled_dot_product_attention(


  0%|          | 0/2873 [00:00<?, ?it/s]

PredictionOutput(predictions=array([[    2,     0,   133, ..., 50118,   133,     2],
       [    2,     0,   133, ...,    20,   493,     2],
       [    2,     0, 29880, ...,  7076,     7,     2],
       ...,
       [    2,     0, 36128, ...,    12,   180,     2],
       [    2,     0, 20770, ...,    80,    12,     2],
       [    2,     0, 25441, ...,    71,  3357,     2]], dtype=int64), label_ids=array([[    0, 31339,  4128, ...,  -100,  -100,  -100],
       [    0,   133,   493, ...,  -100,  -100,  -100],
       [    0, 29880, 41007, ...,  -100,  -100,  -100],
       ...,
       [    0, 31574,   219, ...,  -100,  -100,  -100],
       [    0, 20770,  1088, ...,  -100,  -100,  -100],
       [    0, 25441,   687, ...,    11,  2920,     2]], dtype=int64), metrics={'test_loss': 0.81, 'test_rouge1': 0.61, 'test_rouge2': 0.58, 'test_rougeL': 0.63, 'test_runtime': 1576.3842, 'test_samples_per_second': 7.289, 'test_steps_per_second': 1.823})

In [15]:
# used for pegasus
# Runs prediction over the tokenized test split with the checkpoint that cell 8
# currently loads. The displayed PredictionOutput holds the predictions, the
# label ids and the metrics dict.
trainer.predict(testing_data)

  0%|          | 0/500 [00:00<?, ?it/s]

PredictionOutput(predictions=array([[    0,   139, 18336, ...,     0,     0,     0],
       [    0,   139,  2396, ...,   114, 11494,     1],
       [    0, 26101,  6830, ...,     0,     0,     0],
       ...,
       [    0,  5420,   672, ...,     0,     0,     0],
       [    0, 15254, 15960, ...,     0,     0,     0],
       [    0, 11140, 10379, ...,   142, 11828,     1]], dtype=int64), label_ids=array([[10945,  1106,   109, ...,  -100,  -100,  -100],
       [  139,  2396,   108, ...,  -100,  -100,  -100],
       [26101,  6830,   252, ...,  -100,  -100,  -100],
       ...,
       [20313, 88541,  3999, ...,  -100,  -100,  -100],
       [58869,   547,   109, ...,  -100,  -100,  -100],
       [11140, 10379, 46657, ...,  -100,  -100,  -100]], dtype=int64), metrics={'test_loss': 0.65, 'test_rouge1': 0.67, 'test_rouge2': 0.53, 'test_rougeL': 0.57, 'test_runtime': 8415.3963, 'test_samples_per_second': 0.238, 'test_steps_per_second': 0.059})