## Check on what you're running the .ipynb

In [2]:
!nvidia-smi

Wed Jan 31 19:22:27 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   34C    P8               9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

# Project setup

In [None]:
!pip install transformers[sentencepiece] datasets sacrebleu rouge_score py7zr -q

In [None]:
!pip install --upgrade accelerate -q
!pip uninstall -y transformers accelerate -q
!pip install transformers accelerate -q

## Load packages and functions

In [5]:
from transformers import pipeline, set_seed, AutoModelForSeq2SeqLM, AutoTokenizer, DataCollatorForSeq2Seq, TrainingArguments, Trainer
from datasets import load_dataset, load_from_disk, load_metric, load_dataset
import matplotlib.pyplot as plt
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize

from tqdm import tqdm
import torch

nltk.download("punkt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

## Load the [model](https://huggingface.co/google/pegasus-cnn_dailymail) and the tokenizer (pegasus tokenizer)

In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"

model_checkpoint = "google/pegasus-cnn_dailymail"

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint).to(device)

# Download and unzip the data

In [None]:
!wget https://github.com/NicolaCortinovis/Demo-repo/raw/main/samsum.zip
!unzip samsum.zip -d samsum_data

Load the dataset

In [8]:
samsum_dataset = load_from_disk("samsum_data/samsum_dataset")

samsum_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 14732
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 819
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 818
    })
})

# Brief data exploration

In [9]:
def print_split_details(dataset, split):
    """Print the details of a dataset split."""
    length = len(dataset[split])
    print(f"Length of the {split} split: {length}")
    print(f"Features: {dataset[split].column_names}\n")

def print_example(dataset, split):
    """Print an example of dialogue and summary from a dataset split."""
    example = dataset[split][0]
    print(f"Example of dialogue:\n\n{example['dialogue']}\n\nExample of summary:\n\n{example['summary']}")

# Print the details of each split
for split in samsum_dataset:
    print_split_details(samsum_dataset, split)

# Print an example of dialogue and summary from the train split
print_example(samsum_dataset, "train")

Length of the train split: 14732
Features: ['id', 'dialogue', 'summary']

Length of the test split: 819
Features: ['id', 'dialogue', 'summary']

Length of the validation split: 818
Features: ['id', 'dialogue', 'summary']

Example of dialogue:

Amanda: I baked  cookies. Do you want some?
Jerry: Sure!
Amanda: I'll bring you tomorrow :-)

Example of summary:

Amanda baked cookies and will bring Jerry some tomorrow.


# Data preprocessing

In [10]:
def convert_examples_to_features(example_batch):
    """
    Convert a batch of examples to model features.

    Args:
        example_batch: A batch of examples.
            Each example is a dictionary with 'dialogue' and 'summary' keys.

    Returns:
        Dict: A dictionary with keys 'input_ids', 'attention_mask', and 'labels'.
            Each value is a list of tokenized inputs.

    Raises:
        ValueError: If 'dialogue' or 'summary' keys are not in example_batch.
    """
    if 'dialogue' not in example_batch or 'summary' not in example_batch:
        raise ValueError("'dialogue' and 'summary' keys must be in example_batch")

    # Tokenize the 'dialogue' field of each example in the batch.
    input_encodings = tokenizer(example_batch['dialogue'], max_length=1024, truncation=True)

    # Use the tokenizer as a target tokenizer.
    with tokenizer.as_target_tokenizer():
        # Tokenize the 'summary' field of each example in the batch.
        target_encodings = tokenizer(example_batch['summary'], max_length=128, truncation=True)

    # Return a dictionary containing the input IDs, attention masks, and labels for each example in the batch.
    return {
        'input_ids': input_encodings['input_ids'],
        'attention_mask': input_encodings['attention_mask'],
        'labels': target_encodings['input_ids']
    }

In [None]:
samsum_data_preproc = samsum_dataset.map(convert_examples_to_features, batched = True)

In [12]:
samsum_data_preproc["train"] # Now 6 columns

Dataset({
    features: ['id', 'dialogue', 'summary', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 14732
})

# Training

## Data collator

In [13]:
seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model = model_pegasus)

In [17]:
training_args = TrainingArguments(
    # Directory where the outputs (checkpoints, predictions, etc.) will be saved
    output_dir='pegasus-samsum',

    # Training parameters
    num_train_epochs=1,
    warmup_steps=500,
    weight_decay=0.01,
    gradient_accumulation_steps=16,

    # Batch sizes
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,

    # Logging and evaluation parameters
    logging_steps=10,
    evaluation_strategy='steps',
    eval_steps=500,

    # Checkpoint saving parameters
    save_steps=1e6,
)

## Trainer
 ___note that here we used only a part of the training for computation purposes, remember to change this if you want to train and obtain proper evaluations___



In [28]:
training_subset = samsum_data_preproc["train"].select(range(1000))

Dataset({
    features: ['id', 'dialogue', 'summary', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 1000
})

In [30]:

trainer = Trainer(
    # The model to be trained
    model=model_pegasus,

    # Training arguments (like the number of epochs, batch size, etc.)
    args=training_args,

    # The tokenizer to be used for preprocessing the data
    tokenizer=tokenizer,

    # The data collator to be used for creating batches of data
    data_collator=seq2seq_data_collator,

    # The datasets to be used for training and evaluation,
    # train_dataset=samsum_data_preproc["train"],
    train_dataset = training_subset,
    eval_dataset=samsum_data_preproc["validation"]
)

In [31]:
trainer.train()

Step,Training Loss,Validation Loss


TrainOutput(global_step=62, training_loss=2.359005835748488, metrics={'train_runtime': 209.3231, 'train_samples_per_second': 4.777, 'train_steps_per_second': 0.296, 'total_flos': 369425361887232.0, 'train_loss': 2.359005835748488, 'epoch': 0.99})

# Evaluation

In [32]:
def generate_batch_sized_chunks(list_of_elements, batch_size):
    """
    Split the dataset into smaller batches that we can process simultaneously.
    This is done by slicing the list into sublists of size 'batch_size'.
    """
    return [list_of_elements[i : i + batch_size] for i in range(0, len(list_of_elements), batch_size)]

def tokenize_and_generate_summaries(article_batch, model, tokenizer, device):
    """
    Tokenize the articles and generate summaries.
    The articles are tokenized and then passed to the model to generate summaries.
    The generated summaries are then decoded and returned.
    """
    # Tokenize the articles
    inputs = tokenizer(article_batch, max_length=1024, truncation=True, padding="max_length", return_tensors="pt")

    # Generate summaries
    summaries = model.generate(input_ids=inputs["input_ids"].to(device), attention_mask=inputs["attention_mask"].to(device), length_penalty=0.8, num_beams=8, max_length=128)

    # Decode the summaries
    decoded_summaries = [tokenizer.decode(s, skip_special_tokens=True, clean_up_tokenization_spaces=True) for s in summaries]

    # Return the decoded summaries
    return [d.replace("", " ") for d in decoded_summaries]

def calculate_metric_on_test_ds(dataset, metric, model, tokenizer, batch_size=16, device=device, column_text="article", column_summary="highlights"):
    """
    Calculate a metric on a test dataset.
    The dataset is split into batches, and then each batch is processed to generate summaries.
    The summaries are then compared with the target summaries to calculate the metric.
    """
    # Split the dataset into batches
    article_batches = generate_batch_sized_chunks(dataset[column_text], batch_size)
    target_batches = generate_batch_sized_chunks(dataset[column_summary], batch_size)

    # Process each batch
    for article_batch, target_batch in tqdm(zip(article_batches, target_batches), total=len(article_batches)):
        # Generate summaries for the batch
        decoded_summaries = tokenize_and_generate_summaries(article_batch, model, tokenizer, device)

        # Add the summaries and the target summaries to the metric
        metric.add_batch(predictions=decoded_summaries, references=target_batch)

    # Compute and return the metric
    return metric.compute()

## Metrics

In [35]:
# Define the list of ROUGE metrics we want to compute
rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]

# Load the ROUGE metric from the datasets library
rouge_metric = load_metric('rouge')

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


## Compute the score

In [38]:
def calculate_score_and_create_dataframe(test, metric, model, tokenizer, rouge_names, batch_size= 2, column_text = 'dialogue', column_summary= 'summary'):
    """
    Calculate the score on a test dataset and create a DataFrame with the results.

    Args:
        test: The test dataset.
        metric: The metric to calculate.
        model: The model to use for prediction.
        tokenizer: The tokenizer to use for tokenization.
        rouge_names: List of ROUGE metrics to compute.
        batch_size: The batch size. Defaults to 2.
        column_text: The column name for the text. Defaults to 'dialogue'.
        column_summary: The column name for the summary. Defaults to 'summary'.

    Returns:
        pd.DataFrame: A DataFrame with the results.
    """
    # Calculate the score on the test dataset
    score = calculate_metric_on_test_ds(
        test, metric, model, tokenizer, batch_size = batch_size, column_text = column_text, column_summary= column_summary
    )

    # Create a dictionary with the scores for each ROUGE metric
    rouge_dict = dict((rn, score[rn].mid.fmeasure ) for rn in rouge_names )

    # Create a DataFrame with the results
    df = pd.DataFrame(rouge_dict, index = ['pegasus'])

    return df

 ___note that here we used only a part of the test for computation purposes, remember to change this if you want to evaluate on the full testing set___

In [42]:
test_subset = samsum_data_preproc["test"].select(range(100))

In [43]:
calculate_score_and_create_dataframe(
    test = test_subset,
    metric = rouge_metric,
    model = trainer.model,
    tokenizer = tokenizer,
    rouge_names = rouge_names
)

100%|██████████| 50/50 [02:51<00:00,  3.42s/it]


Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum
pegasus,0.015223,0.0,0.015113,0.015039


# Saving the model and the tokenizer

In [44]:
model_pegasus.save_pretrained("pegasus-samsum-model")
tokenizer.save_pretrained("tokenizer")

Non-default generation parameters: {'max_length': 128, 'min_length': 32, 'num_beams': 8, 'length_penalty': 0.8, 'forced_eos_token_id': 1}


('tokenizer/tokenizer_config.json',
 'tokenizer/special_tokens_map.json',
 'tokenizer/spiece.model',
 'tokenizer/added_tokens.json',
 'tokenizer/tokenizer.json')

## Loading

In [45]:
tokenizer = AutoTokenizer.from_pretrained("/content/tokenizer")

# Prediction

In [47]:
# Define the generation arguments for the summarization pipeline
gen_kwargs = {"length_penalty": 0.8, "num_beams":8, "max_length": 128}

# Extract the dialogue from the first sample in the test dataset
sample_text = samsum_dataset["test"][0]["dialogue"]

# Extract the reference summary from the first sample in the test dataset
reference = samsum_dataset["test"][0]["summary"]

# Initialize the summarization pipeline with the Pegasus model and the specified tokenizer
pipe = pipeline("summarization", model="pegasus-samsum-model",tokenizer=tokenizer)

# Print the dialogue
print("Dialogue:")
print(sample_text)

# Print the reference summary
print("\nReference Summary:")
print(reference)

# Generate a summary using the Pegasus model and print it
print("\nModel Summary:")
print(pipe(sample_text, **gen_kwargs)[0]["summary_text"])

Your max_length is set to 128, but your input_length is only 122. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=61)


Dialogue:
Hannah: Hey, do you have Betty's number?
Amanda: Lemme check
Hannah: <file_gif>
Amanda: Sorry, can't find it.
Amanda: Ask Larry
Amanda: He called her last time we were at the park together
Hannah: I don't know him well
Hannah: <file_gif>
Amanda: Don't be shy, he's very nice
Hannah: If you say so..
Hannah: I'd rather you texted him
Amanda: Just text him 🙂
Hannah: Urgh.. Alright
Hannah: Bye
Amanda: Bye bye

Reference Summary:
Hannah needs Betty's number but Amanda doesn't have it. She needs to contact Larry.

Model Summary:
Amanda can't find Betty's number. Larry called her last time we were at the park together. Hannah would rather you texted him. Amanda: I'd rather you texted him.
