<a href="https://colab.research.google.com/github/Hicham-Yezza/Hicham-Yezza/blob/main/T5-fine-tuning-190824.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the required libraries
!pip install transformers torch datasets nltk

Collecting datasets
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cu

In [None]:
from datasets import load_dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq, EarlyStoppingCallback
import torch

# Step 1: Check if GPU is available
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Step 2: Load the CNN/DailyMail dataset
dataset = load_dataset("cnn_dailymail", "3.0.0")

# Step 3: Load the T5 tokenizer and model, and move the model to the GPU
tokenizer = T5Tokenizer.from_pretrained("t5-base")
model = T5ForConditionalGeneration.from_pretrained("t5-base").to(device)  # Ensure model is on GPU

# Step 4: Preprocess the dataset
def preprocess_function(examples):
    # Add prefix "summarize: " to guide the model
    inputs = ["summarize: " + doc for doc in examples["article"]]

    # Tokenize inputs and ensure padding and truncation
    model_inputs = tokenizer(
        inputs,
        max_length=512,
        padding="max_length",  # Ensure all inputs are the same length
        truncation=True
    )

    # Tokenize targets (summaries) and ensure padding and truncation
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            examples["highlights"],
            max_length=150,
            padding="max_length",  # Ensure all labels are the same length
            truncation=True
        )

    model_inputs["labels"] = labels["input_ids"]

    return model_inputs

# Apply the preprocessing function to the dataset
tokenized_datasets = dataset.map(preprocess_function, batched=True, remove_columns=["article", "highlights", "id"])

# Use a smaller subset of the dataset for quicker testing
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
small_eval_dataset = tokenized_datasets["validation"].shuffle(seed=42).select(range(500))

# Step 5: Define data collator to handle padding during training
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Step 6: Set up fine-tuning arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    logging_dir='./logs',
    report_to=[],  # Avoid external integrations (like Azure ML)
    fp16=True,  # Enable mixed precision for faster training on GPU
    load_best_model_at_end=True,  # Automatically load the best model found during training
)

# Step 7: Initialize the Seq2SeqTrainer with Early Stopping
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,  # Ensure correct padding and batching
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],  # Add early stopping
)

# Step 8: Fine-tune the model
trainer.train()

# Step 9: Save the fine-tuned model and tokenizer
model.save_pretrained("./fine_tuned_t5")
tokenizer.save_pretrained("./fine_tuned_t5")

# Step 10: Generate a summary using the fine-tuned model
def generate_summary(text):
    input_ids = tokenizer.encode("summarize: " + text, return_tensors="pt", max_length=512, truncation=True).to(device)
    summary_ids = model.generate(input_ids, max_length=150, num_beams=2, early_stopping=True)
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Test the model on a sample from the validation set
sample_text = small_eval_dataset[0]["article"]
print("Original Text: ", sample_text)
print("Generated Summary: ", generate_summary(sample_text))
print("Reference Summary: ", small_eval_dataset[0]["highlights"])


Using device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/15.6k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/257M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/257M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/259M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/34.7M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/30.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Map:   0%|          | 0/287113 [00:00<?, ? examples/s]



Map:   0%|          | 0/13368 [00:00<?, ? examples/s]

Map:   0%|          | 0/11490 [00:00<?, ? examples/s]



ValueError: --load_best_model_at_end requires the save and eval strategy to match, but found
- Evaluation strategy: epoch
- Save strategy: steps

In [None]:
from datasets import load_dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq, EarlyStoppingCallback
import torch

# Step 1: Check if GPU is available
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Step 2: Load the CNN/DailyMail dataset
dataset = load_dataset("cnn_dailymail", "3.0.0")

# Step 3: Load the T5 tokenizer and model, and move the model to the GPU
tokenizer = T5Tokenizer.from_pretrained("t5-base")
model = T5ForConditionalGeneration.from_pretrained("t5-base").to(device)  # Ensure model is on GPU

# Step 4: Preprocess the dataset
def preprocess_function(examples):
    # Add prefix "summarize: " to guide the model
    inputs = ["summarize: " + doc for doc in examples["article"]]

    # Tokenize inputs and ensure padding and truncation
    model_inputs = tokenizer(
        inputs,
        max_length=512,
        padding="max_length",  # Ensure all inputs are the same length
        truncation=True
    )

    # Tokenize targets (summaries) and ensure padding and truncation
    labels = tokenizer(
        text_target=examples["highlights"],
        max_length=150,
        padding="max_length",  # Ensure all labels are the same length
        truncation=True
    )

    model_inputs["labels"] = labels["input_ids"]

    return model_inputs

# Apply the preprocessing function to the dataset
tokenized_datasets = dataset.map(preprocess_function, batched=True, remove_columns=["article", "highlights", "id"])

# Use a smaller subset of the dataset for quicker testing
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
small_eval_dataset = tokenized_datasets["validation"].shuffle(seed=42).select(range(500))

# Step 5: Define data collator to handle padding during training
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Step 6: Set up fine-tuning arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",  # Ensure evaluation strategy is the same as save strategy
    save_strategy="epoch",  # Save at the end of each epoch to match evaluation strategy
    learning_rate=3e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    logging_dir='./logs',
    report_to=[],  # Avoid external integrations (like Azure ML)
    fp16=True,  # Enable mixed precision for faster training on GPU
    load_best_model_at_end=True,  # Automatically load the best model found during training
)

# Step 7: Initialize the Seq2SeqTrainer with Early Stopping
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,  # Ensure correct padding and batching
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],  # Add early stopping
)

# Step 8: Fine-tune the model
trainer.train()

# Step 9: Save the fine-tuned model and tokenizer
model.save_pretrained("./fine_tuned_t5")
tokenizer.save_pretrained("./fine_tuned_t5")

# Step 10: Generate a summary using the fine-tuned model
def generate_summary(text):
    input_ids = tokenizer.encode("summarize: " + text, return_tensors="pt", max_length=512, truncation=True).to(device)
    summary_ids = model.generate(input_ids, max_length=150, num_beams=2, early_stopping=True)
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Test the model on a sample from the validation set
sample_text = small_eval_dataset[0]["article"]
print("Original Text: ", sample_text)
print("Generated Summary: ", generate_summary(sample_text))
print("Reference Summary: ", small_eval_dataset[0]["highlights"])


Using device: cuda


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/287113 [00:00<?, ? examples/s]

Map:   0%|          | 0/13368 [00:00<?, ? examples/s]

Map:   0%|          | 0/11490 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss
1,No log,0.884513
2,No log,0.847309
3,No log,0.844849


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


KeyError: 'article'

In [None]:
from datasets import load_dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq, EarlyStoppingCallback
import torch

# Step 1: Check if GPU is available
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Step 2: Load the CNN/DailyMail dataset
dataset = load_dataset("cnn_dailymail", "3.0.0")

# Step 3: Load the T5 tokenizer and model, and move the model to the GPU
tokenizer = T5Tokenizer.from_pretrained("t5-base")
model = T5ForConditionalGeneration.from_pretrained("t5-base").to(device)  # Ensure model is on GPU

# Step 4: Preprocess the dataset
def preprocess_function(examples):
    # Add prefix "summarize: " to guide the model
    inputs = ["summarize: " + doc for doc in examples["article"]]

    # Tokenize inputs and ensure padding and truncation
    model_inputs = tokenizer(
        inputs,
        max_length=512,
        padding="max_length",  # Ensure all inputs are the same length
        truncation=True
    )

    # Tokenize targets (summaries) and ensure padding and truncation
    labels = tokenizer(
        text_target=examples["highlights"],
        max_length=150,
        padding="max_length",  # Ensure all labels are the same length
        truncation=True
    )

    model_inputs["labels"] = labels["input_ids"]

    return model_inputs

# Apply the preprocessing function to the dataset
tokenized_datasets = dataset.map(preprocess_function, batched=True, remove_columns=["article", "highlights", "id"])

# Use a smaller subset of the dataset for quicker testing
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
small_eval_dataset = tokenized_datasets["validation"].shuffle(seed=42).select(range(500))

# Step 5: Define data collator to handle padding during training
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Step 6: Set up fine-tuning arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",  # Ensure evaluation strategy is the same as save strategy
    save_strategy="epoch",  # Save at the end of each epoch to match evaluation strategy
    learning_rate=3e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    logging_dir='./logs',
    report_to=[],  # Avoid external integrations (like Azure ML)
    fp16=True,  # Enable mixed precision for faster training on GPU
    load_best_model_at_end=True,  # Automatically load the best model found during training
)

# Step 7: Initialize the Seq2SeqTrainer with Early Stopping
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,  # Ensure correct padding and batching
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],  # Add early stopping
)

# Step 8: Fine-tune the model
trainer.train()

# Step 9: Save the fine-tuned model and tokenizer
model.save_pretrained("./fine_tuned_t5")
tokenizer.save_pretrained("./fine_tuned_t5")

# Step 10: Generate a summary using the fine-tuned model
def generate_summary(text):
    input_ids = tokenizer.encode("summarize: " + text, return_tensors="pt", max_length=512, truncation=True).to(device)
    summary_ids = model.generate(input_ids, max_length=150, num_beams=2, early_stopping=True)
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Step 11: Decode and Test the model on a sample from the validation set
decoded_article = tokenizer.decode(small_eval_dataset[0]['input_ids'], skip_special_tokens=True, clean_up_tokenization_spaces=True)
print("Original Text: ", decoded_article)
print("Generated Summary: ", generate_summary(decoded_article))
print("Reference Summary: ", tokenizer.decode(small_eval_dataset[0]['labels'], skip_special_tokens=True, clean_up_tokenization_spaces=True))


Using device: cuda


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Epoch,Training Loss,Validation Loss
1,No log,0.884513
2,No log,0.847309
3,No log,0.844849


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


Original Text:  summarize: Jarryd Hayne's move to the NFL is a boost for rugby league in the United States, it has been claimed. The Australia international full-back or centre quit the National Rugby League in October to try his luck in American football and was this week given a three-year contract with the San Francisco 49ers. Peter Illfield, chairman of US Association of Rugby League, said: 'Jarryd, at 27, is one of the most gifted and talented rugby league players in Australia. He is an extraordinary athlete. Jarryd Hayne (right) has signed with the San Francisco 49ers after quitting the NRL in October. Hayne, who played rugby league for Australia, has signed a three year contract with the 49ers. 'His three-year deal with the 49ers, as an expected running back, gives the USA Rugby League a connection with the American football lover like never before. 'Jarryd's profile and playing ability will bring our sport to the attention of many. It also has the possibility of showing the Ame

In [None]:
from datasets import load_dataset, load_metric
from transformers import T5Tokenizer, T5ForConditionalGeneration, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq, EarlyStoppingCallback
import torch

# Step 1: Check if GPU is available
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Step 2: Load the CNN/DailyMail dataset
dataset = load_dataset("cnn_dailymail", "3.0.0")

# Step 3: Load the T5 tokenizer and model, and move the model to the GPU
tokenizer = T5Tokenizer.from_pretrained("t5-base")
model = T5ForConditionalGeneration.from_pretrained("t5-base").to(device)

# Re-initialize the model's token embeddings to match the new tokenizer size
model.resize_token_embeddings(len(tokenizer))

# Step 4: Preprocess the dataset
def preprocess_function(examples):
    inputs = ["summarize: " + doc for doc in examples["article"]]
    model_inputs = tokenizer(inputs, max_length=512, padding="max_length", truncation=True)
    labels = tokenizer(text_target=examples["highlights"], max_length=150, padding="max_length", truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_datasets = dataset.map(preprocess_function, batched=True, remove_columns=["article", "highlights", "id"])
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
small_eval_dataset = tokenized_datasets["validation"].shuffle(seed=42).select(range(500))

# Step 5: Define data collator
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Step 6: Set up fine-tuning arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    logging_dir='./logs',
    report_to=[],
    fp16=True,
    load_best_model_at_end=True,
)

# Step 7: Initialize the Seq2SeqTrainer with Early Stopping
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

# Step 8: Fine-tune the model
trainer.train()

# Step 9: Save the fine-tuned model and tokenizer
model.save_pretrained("./fine_tuned_t5")
tokenizer.save_pretrained("./fine_tuned_t5")

# Step 10: Generate a summary using the fine-tuned model
def generate_summary(text):
    input_ids = tokenizer.encode("summarize: " + text, return_tensors="pt", max_length=512, truncation=True).to(device)
    summary_ids = model.generate(input_ids, max_length=150, num_beams=2, early_stopping=True)
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Step 11: Decode and Test the model on a sample from the validation set
decoded_article = tokenizer.decode(small_eval_dataset[0]['input_ids'], skip_special_tokens=True, clean_up_tokenization_spaces=True)
print("Original Text: ", decoded_article)
print("Generated Summary: ", generate_summary(decoded_article))
print("Reference Summary: ", tokenizer.decode(small_eval_dataset[0]['labels'], skip_special_tokens=True, clean_up_tokenization_spaces=True))


Using device: cuda


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/287113 [00:00<?, ? examples/s]