In [None]:
!pip install evaluate



In [None]:
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig, TrainingArguments, Trainer
import torch
import time
import evaluate
import pandas as pd
import numpy as np

In [None]:
# load_dataset is also imported in the imports cell; repeating it here is harmless.
from datasets import load_dataset

# English Wikipedia snapshot "20231101.en". The recorded output below shows a
# single 'train' split with 6,407,814 rows and the columns id / url / title / text.
ds = load_dataset("wikimedia/wikipedia", "20231101.en")
ds

Resolving data files:   0%|          | 0/41 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/41 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'url', 'title', 'text'],
        num_rows: 6407814
    })
})

In [None]:
# Hub checkpoint id shared by the tokenizer and the seq2seq model.
model_name = 'facebook/bart-large-cnn'

# The two loads are independent of each other; the model weights are requested in bfloat16.
tokenizer = AutoTokenizer.from_pretrained(model_name)
original_model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
)

In [None]:
def print_number_of_trainable_model_parameters(model):
    """
    Summarize how many of a model's parameters are trainable.

    Despite its name, this function does not print anything: it returns the
    report as a string so the caller decides how to display it.

    Args:
        model: Any object exposing ``named_parameters()`` that yields
            ``(name, parameter)`` pairs, where each parameter provides
            ``numel()`` and a ``requires_grad`` flag.

    Returns:
        A three-line string with the trainable parameter count, the total
        parameter count, and the trainable share as a percentage with two
        decimals. A model without any parameters reports 0.00%.
    """
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_model_params += count
        if param.requires_grad:
            trainable_model_params += count

    # Guard the division: a parameterless model would otherwise raise ZeroDivisionError.
    if all_model_params:
        percentage = 100 * trainable_model_params / all_model_params
    else:
        percentage = 0.0

    return (
        f"trainable model parameters: {trainable_model_params}\n"
        f"all model parameters: {all_model_params}\n"
        f"percentage of trainable model parameters: {percentage:.2f}%"
    )

# Show the parameter report for the base model loaded above.
print(print_number_of_trainable_model_parameters(original_model))

trainable model parameters: 406290432
all model parameters: 406290432
percentage of trainable model parameters: 100.00%


In [None]:
index = 200
# Character budget for the article excerpt: the slice below cuts the raw string,
# it does not count tokens.
max_length = 1024

# Fetch the row once from the 'train' split and read both fields from it.
example = ds['train'][index]
article_title = example['title']
article_text = example['text'][:max_length]

# The instruction wording matches the one used in tokenize_function ("article").
prompt = f"""
Summarize the following article.

{article_text}

Summary:
"""

# truncation=True keeps an over-long prompt within the tokenizer's model_max_length.
inputs = tokenizer(prompt, return_tensors='pt', truncation=True)

# Pass the attention mask explicitly instead of letting generate() infer it.
generated_ids = original_model.generate(
    input_ids=inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    max_new_tokens=200,
)
output = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

dash_line = '-' * 99  # 99-character horizontal rule
print(dash_line)
print(f'INPUT PROMPT:\n{prompt}')
print(dash_line)
# The title is the reference target used for training in this notebook, so it is the
# baseline to compare against (the article text itself is the input, not a summary).
print(f'BASELINE HUMAN SUMMARY:\n{article_title}\n')
print(dash_line)
print(f'MODEL GENERATION - ZERO SHOT:\n{output}')

---------------------------------------------------------------------------------------------------
INPUT PROMPT:

Summarize the following conversation.

The Plague () is a 1947 absurdist novel by Albert Camus. It tells the story from the point of view of a narrator in the midst of a plague sweeping the French Algerian city of Oran. The narrator remains unknown until the start of the last chapter. The novel presents a snapshot of life in Oran as seen through the author's distinctive absurdist point of view.

Camus used as source material the cholera epidemic that killed a large proportion of Oran's population in 1849, but situated the novel in the 1940s. Oran and its surroundings were struck by disease several times before Camus published his novel. According to an academic study, Oran was decimated by the bubonic plague in 1556 and 1678, but all later outbreaks (in 1921: 185 cases; 1931: 76 cases; and 1944: 95 cases) were very far from the scale of the epidemic described in the novel.

In [None]:
# Shuffle with a fixed seed, then keep a smaller subset
small_dataset = ds["train"].shuffle(seed=42).select(range(1000))  # 1000 rows taken after the shuffle: a seeded random sample, not the dataset's first 1000 rows

In [None]:
def tokenize_function(examples):
    """
    Tokenizes a batch of articles for title generation (used here as summarization).

    Each article text is wrapped in an instruction prompt and becomes the model
    input; the article title becomes the target sequence.

    Args:
        examples: A batch dictionary containing the 'text' and 'title' fields,
            each a list of strings (this function is mapped with batched=True).

    Returns:
        A dictionary with 'input_ids' and 'attention_mask' (padded/truncated to
        1024 tokens) and 'labels' (title token ids, truncated to at most 1024
        tokens and left unpadded).
    """
    start_prompt = 'Summarize the following article.\n\n'
    end_prompt = '\n\nSummary: '

    # Construct the prompts using the article text.
    prompts = [start_prompt + text + end_prompt for text in examples["text"]]

    # Plain Python lists are returned (no return_tensors): variable-length label
    # rows cannot be packed into a single tensor, and the dataset stores lists anyway.
    tokenized_inputs = tokenizer(
        prompts,
        padding="max_length",
        truncation=True,
        max_length=1024,
    )

    # The title is the target. Labels are intentionally NOT padded here: padding them
    # with the tokenizer's pad token would make the loss score all of that padding as
    # real target tokens. The DataCollatorForSeq2Seq used by the Trainer pads labels
    # per batch with the ignore index (-100) instead.
    tokenized_labels = tokenizer(
        examples["title"],
        truncation=True,
        max_length=1024,
    )

    return {
        "input_ids": tokenized_inputs["input_ids"],
        "attention_mask": tokenized_inputs["attention_mask"],
        "labels": tokenized_labels["input_ids"],
    }

# Tokenize the sampled articles batch-wise across 8 worker processes and drop the
# raw columns so that only the tokenizer outputs remain.
tokenized_datasets = small_dataset.map(
    tokenize_function,
    batched=True,
    num_proc=8,
    remove_columns=['id', 'url', 'title', 'text'],
)

In [None]:
# Keep every 100th row (positions 0, 100, 200, ...). The result is assigned back to
# the same name, so re-running this cell subsamples the already-reduced dataset again.
every_hundredth = range(0, len(tokenized_datasets), 100)
tokenized_datasets = tokenized_datasets.select(every_hundredth)

In [None]:
# Persist the tokenized subset under the relative path 'tokenized_wikipedia'.
tokenized_datasets.save_to_disk('tokenized_wikipedia')

Saving the dataset (0/1 shards):   0%|          | 0/10 [00:00<?, ? examples/s]

In [None]:
# Report the (rows, columns) shape of the tokenized dataset, then its summary repr.
shape_report = "\n".join([
    "Shapes of the datasets:",
    f"Tokenized: {tokenized_datasets.shape}",
])
print(shape_report)

print(tokenized_datasets)

Shapes of the datasets:
Tokenized: (10, 3)
Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 10
})


In [None]:
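# NOTE: left disabled. `tokenized_datasets` is a single Dataset here (see the output
# above), not a DatasetDict with 'train' / 'validation' / 'test' splits, so the
# per-split lookups below do not apply.
# print(f"Shapes of the datasets:")
# print(f"Training: {tokenized_datasets['train'].shape}")
# print(f"Validation: {tokenized_datasets['validation'].shape}")
# print(f"Test: {tokenized_datasets['test'].shape}")

# print(tokenized_datasets)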

In [None]:
from peft import LoraConfig, get_peft_model, TaskType

# LoRA adapter settings for the sequence-to-sequence summarization model.
lora_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    target_modules=["q_proj", "v_proj"],  # module names that receive adapters
    r=32,                                 # adapter rank
    lora_alpha=32,                        # alpha parameter for LoRA scaling
    lora_dropout=0.05,
    bias="none",
)

In [None]:
# Wrap the base model with the LoRA configuration defined above and report how
# many parameters remain trainable.
# NOTE(review): PEFT presumably injects the adapter layers into the model object it
# is given, so `original_model` may no longer be an untouched baseline after this
# call, and re-running this cell may wrap it again -- confirm before reusing it.
peft_model = get_peft_model(original_model,
                            lora_config)
print(print_number_of_trainable_model_parameters(peft_model))

trainable model parameters: 4718592
all model parameters: 411009024
percentage of trainable model parameters: 1.15%


In [None]:
from transformers import DataCollatorForSeq2Seq

# Timestamped output directory, so each run writes to its own folder.
run_stamp = int(time.time())
output_dir = f'./peft-dialogue-summary-training-{run_stamp}'

peft_training_args = TrainingArguments(
    output_dir=output_dir,
    learning_rate=1e-3,  # Higher learning rate than full fine-tuning.
    auto_find_batch_size=True,
    max_steps=1,  # per the recorded warning below, this overrides num_train_epochs
    num_train_epochs=1,
    logging_steps=1,
    remove_unused_columns=False,
)

# Collator that batches the tokenized rows for the seq2seq model.
seq2seq_collator = DataCollatorForSeq2Seq(tokenizer, model=peft_model)

peft_trainer = Trainer(
    model=peft_model,
    args=peft_training_args,
    train_dataset=tokenized_datasets,
    data_collator=seq2seq_collator,
)

max_steps is given, it will override any value given in num_train_epochs


In [None]:
# Local folder that receives both the trained PEFT model and the tokenizer files.
peft_model_path = "./wikipediasummary-checkpoint-local"

# Run training with the arguments configured on the trainer.
peft_trainer.train()

# Save the model first, then the tokenizer; the tokenizer call stays last so the
# cell still displays the tuple of written tokenizer files.
trained_model = peft_trainer.model
trained_model.save_pretrained(peft_model_path)
tokenizer.save_pretrained(peft_model_path)

Step,Training Loss
1,11.0


('./wikipediasummary-checkpoint-local/tokenizer_config.json',
 './wikipediasummary-checkpoint-local/special_tokens_map.json',
 './wikipediasummary-checkpoint-local/vocab.json',
 './wikipediasummary-checkpoint-local/merges.txt',
 './wikipediasummary-checkpoint-local/added_tokens.json',
 './wikipediasummary-checkpoint-local/tokenizer.json')

In [None]:
from peft import PeftModel, PeftConfig

# Load a fresh copy of the base checkpoint and its tokenizer to attach the saved adapter to.
peft_model_base = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn", torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")

# The adapter directory must be the one written by save_pretrained in the training
# cell. Note the './' prefix: '.wikipediasummary-checkpoint-local' names a different
# (hidden) directory and fails with "Can't find 'adapter_config.json'".
peft_model = PeftModel.from_pretrained(peft_model_base,
                                       './wikipediasummary-checkpoint-local',
                                       torch_dtype=torch.bfloat16,
                                       is_trainable=False)

ValueError: Can't find 'adapter_config.json' at '.wikipediasummary-checkpoint-local'

In [None]:
# Parameter report for whatever `peft_model` is currently bound to in the kernel.
# NOTE(review): the recorded output below matches the training-time adapter counts;
# the reload cell's recorded output is an error, so `peft_model` presumably still
# referred to the trained model when this ran -- re-run to confirm.
print(print_number_of_trainable_model_parameters(peft_model))

trainable model parameters: 4718592
all model parameters: 411009024
percentage of trainable model parameters: 1.15%
