## PEFT Training
Traditional fine-tuning
### Imports

In [None]:
!pip install -U transformers datasets scikit-learn

Collecting transformers
  Downloading transformers-4.53.1-py3-none-any.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.9/40.9 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting scikit-learn
  Downloading scikit_learn-1.7.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (17 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading transformers-4.53.1-py3-none-any.whl (10.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.8/10.8 MB[0m [31m125.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m37.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading scikit_learn-1.7.0-cp311-cp311-manylinux_

In [None]:
import torch
from torch.utils.data import DataLoader, Dataset, random_split
from transformers import T5Tokenizer, T5ForConditionalGeneration, get_scheduler
from tqdm.auto import tqdm
from sklearn.metrics import accuracy_score
import numpy as np
from datasets import load_dataset

### Hold out 10% of the training data for validation

In [None]:
# Interactive Hugging Face Hub login (renders a widget in the notebook).
# NOTE(review): the Hub warning printed further down says authentication is
# optional for public models/datasets — confirm whether this cell is required.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Load CoS-E v1.11. The hub's "validation" split is exposed under the key
# "test"; a separate validation set is carved out of "train" further down.
ds = load_dataset(
    "Salesforce/cos_e",
    "v1.11",
    split={"train": "train", "test": "validation"},
)

def modify_example(example):
    """Convert one raw record into the ``input``/``label`` pair used for training.

    Args:
        example: Mapping with a ``question`` string, a ``choices`` sequence of
            answer strings, and an ``answer`` string.

    Returns:
        dict with
        ``input``: the question followed by ``Answer Choices:`` and one
        lettered option per line (``(a) ...``, ``(b) ...``, ...), and
        ``label``: the text of the correct answer, unchanged.
    """
    # Letter the options by position rather than indexing a fixed five-entry
    # list, so a record with more than five choices no longer raises IndexError.
    formatted_choices = "\n".join(
        f"({chr(ord('a') + position)}) {choice}"
        for position, choice in enumerate(example["choices"])
    )
    input_text = f"{example['question']}\nAnswer Choices:\n{formatted_choices}"
    return {
        "input": input_text,
        "label": example["answer"]
    }

# Raw columns that are discarded once every record has "input" and "label".
raw_columns = ['id', 'question', 'choices', 'answer', 'abstractive_explanation', 'extractive_explanation']
dataset = {
    split: data.map(modify_example, remove_columns=raw_columns)
    for split, data in ds.items()
}

# Carve a validation subset out of the original training split.
# (`val_dataset` holds the *full* training split; the name is kept as-is.)
val_dataset = dataset["train"]
val_size = int(0.1 * len(val_dataset))  # 10% held out for validation
train_size = len(val_dataset) - val_size

# A fixed seed makes the train/validation partition identical on every run.
split_generator = torch.Generator().manual_seed(42)
train_set, val_set = random_split(
    val_dataset,
    [train_size, val_size],
    generator=split_generator,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/1.73M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/222k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/9741 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1221 [00:00<?, ? examples/s]

Map:   0%|          | 0/9741 [00:00<?, ? examples/s]

Map:   0%|          | 0/1221 [00:00<?, ? examples/s]

In [None]:
# Gather the three splits under short names so they can be inspected uniformly.
data_splits = {
    'train': train_set,
    'val': val_set,
    'test': dataset['test'],
}

# Sanity check: report each split's size and show its first example.
for split_name, split_data in data_splits.items():
    print(f"Length of {split_name}_set:", len(split_data))
    first_example = split_data[0]
    print(f"First element of {split_name}_set:", first_example)

Length of train_set: 8767
First element of train_set: {'input': 'Where might someone keep personal soap?\nAnswer Choices:\n(a) birthday party\n(b) supermarket\n(c) own home\n(d) jail\n(e) cabinet', 'label': 'own home'}
Length of val_set: 974
First element of val_set: {'input': 'What do you have to do to learn to play violin?\nAnswer Choices:\n(a) tune\n(b) practise\n(c) relaxing\n(d) ask questions\n(e) take lessons', 'label': 'take lessons'}
Length of test_set: 1221
First element of test_set: {'input': 'A beaver is know for building prowess, their supplies come from where?\nAnswer Choices:\n(a) british columbia\n(b) body of water\n(c) wooded area\n(d) pay debts\n(e) zoo', 'label': 'wooded area'}


In [None]:
class T5FineTuningDataset(Dataset):
    """Wrap a collection of ``input``/``label`` text pairs as tokenized tensors.

    Each item is tokenized on access; both the input and the label are padded
    or truncated to ``max_length`` tokens, so all items share the same shape.
    """

    def __init__(self, dataset, tokenizer, max_length=128):
        """Store the data source and tokenization settings.

        Args:
            dataset: Indexable collection supporting ``len()`` whose items
                provide ``'input'`` and ``'label'`` strings.
            tokenizer: Callable tokenizer that accepts ``padding``,
                ``truncation``, ``max_length`` and ``return_tensors`` and
                exposes ``pad_token_id``.
            max_length: Length every input and label sequence is padded or
                truncated to.
        """
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples in the wrapped dataset."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Tokenize example ``idx``.

        Returns:
            dict with 1-D tensors ``input_ids``, ``attention_mask`` and
            ``labels``; padding positions in ``labels`` are set to -100.
        """
        example = self.dataset[idx]

        # Tokenize inputs
        input_encoding = self.tokenizer(
            example['input'],
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )

        # Tokenize labels
        label_encoding = self.tokenizer(
            example['label'],
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )

        # Replace padding token ID with -100
        labels = label_encoding.input_ids.clone()
        labels[labels == self.tokenizer.pad_token_id] = -100

        # Drop only the leading batch dimension added by return_tensors="pt".
        # A bare squeeze() would also collapse the sequence dimension when
        # max_length == 1 and hand back 0-d tensors.
        return {
            'input_ids': input_encoding.input_ids.squeeze(0),
            'attention_mask': input_encoding.attention_mask.squeeze(0),
            'labels': labels.squeeze(0)
        }

In [None]:
def validate_model(model, val_loader, device):
    """Return the mean per-batch loss of ``model`` over ``val_loader``.

    The model is switched to eval mode and gradients are disabled for the
    whole pass. Each batch must provide ``input_ids``, ``attention_mask`` and
    ``labels`` tensors; they are moved to ``device`` before the forward pass.
    """
    model.eval()
    batch_losses = []

    with torch.no_grad():
        for batch in val_loader:
            tensors = {
                key: batch[key].to(device)
                for key in ('input_ids', 'attention_mask', 'labels')
            }
            batch_losses.append(model(**tensors).loss.item())

    # Every batch counts equally, whatever its size.
    return sum(batch_losses) / len(val_loader)

### Training

In [None]:
def train_model(train_set, val_set, model_name="google-t5/t5-small", max_length=1024,
                batch_size=2, num_epochs=25, learning_rate=3e-5, weight_decay=0.01):  # reduced batch size from 16 to 2 due to CUDA memory error
    """Fine-tune a T5 model and return it with its best-validation weights loaded.

    After every epoch the validation loss is computed; whenever it improves,
    a snapshot of the weights is taken. Once all epochs have run, the snapshot
    with the lowest validation loss is loaded back into the model.

    Args:
        train_set: Training examples, wrapped in ``T5FineTuningDataset``.
        val_set: Validation examples, wrapped in ``T5FineTuningDataset``.
        model_name: Checkpoint name passed to ``from_pretrained``.
        max_length: Token length passed to ``T5FineTuningDataset``.
        batch_size: Batch size for both data loaders.
        num_epochs: Number of passes over the training data.
        learning_rate: AdamW learning rate.
        weight_decay: AdamW weight decay.

    Returns:
        ``(model, tokenizer)`` tuple.
    """

    # Load model and tokenizer
    tokenizer = T5Tokenizer.from_pretrained(model_name)
    model = T5ForConditionalGeneration.from_pretrained(model_name)
    # Fall back to CPU so the function still runs on a runtime without a GPU.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)

    # Prepare training and validation datasets
    train_dataset = T5FineTuningDataset(train_set, tokenizer, max_length)
    val_dataset = T5FineTuningDataset(val_set, tokenizer, max_length)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)

    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    total_steps = len(train_loader) * num_epochs

    # Linear decay to zero over the whole run, no warm-up.
    lr_scheduler = get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=total_steps
    )

    # Initialize variables to track best model
    best_val_loss = float('inf')
    best_model_state = None

    # Training loop
    print("Starting training...")
    for epoch in range(num_epochs):
        print(f"Epoch {epoch + 1}/{num_epochs}")

        # Training
        model.train()
        total_train_loss = 0
        progress_bar = tqdm(train_loader, desc=f"Training Epoch {epoch+1}")

        for batch in progress_bar:
            # Move batch to device
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            # Forward pass
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )

            loss = outputs.loss
            total_train_loss += loss.item()

            # Backward pass
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            lr_scheduler.step()

            progress_bar.set_postfix({"loss": loss.item()})

        avg_train_loss = total_train_loss / len(train_loader)
        print(f"Epoch {epoch+1} - Average training loss: {avg_train_loss:.4f}")

        # Validation
        print("Running validation...")
        val_loss = validate_model(model, val_loader, device)
        print(f"Epoch {epoch+1} - Validation loss: {val_loss:.4f}")

        # Save best model
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            # state_dict() hands back references to the live parameter
            # tensors, so a shallow dict copy would keep changing with every
            # later optimizer step. Copy each tensor (onto the CPU, to avoid
            # holding a second set of weights on the GPU) for a true snapshot.
            best_model_state = {
                name: tensor.detach().to("cpu", copy=True)
                for name, tensor in model.state_dict().items()
            }
            print(f"New best model saved with validation loss: {val_loss:.4f}")

    # Load best model for final evaluation
    if best_model_state is not None:
        model.load_state_dict(best_model_state)
        print(f"Loaded best model with validation loss: {best_val_loss:.4f}")

    print("Training completed!")
    return model, tokenizer

In [None]:
# Fine-tune using train_model's default hyperparameters.
model, tokenizer = train_model(train_set, val_set)

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Starting training...
Epoch 1/25


Training Epoch 1:   0%|          | 0/4384 [00:00<?, ?it/s]

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch 1 - Average training loss: 1.1833
Running validation...
Epoch 1 - Validation loss: 0.7182
New best model saved with validation loss: 0.7182
Epoch 2/25


Training Epoch 2:   0%|          | 0/4384 [00:00<?, ?it/s]

Epoch 2 - Average training loss: 0.8947
Running validation...
Epoch 2 - Validation loss: 0.6800
New best model saved with validation loss: 0.6800
Epoch 3/25


Training Epoch 3:   0%|          | 0/4384 [00:00<?, ?it/s]

Epoch 3 - Average training loss: 0.7850
Running validation...
Epoch 3 - Validation loss: 0.6685
New best model saved with validation loss: 0.6685
Epoch 4/25


Training Epoch 4:   0%|          | 0/4384 [00:00<?, ?it/s]

Epoch 4 - Average training loss: 0.7122
Running validation...
Epoch 4 - Validation loss: 0.6824
Epoch 5/25


Training Epoch 5:   0%|          | 0/4384 [00:00<?, ?it/s]

Epoch 5 - Average training loss: 0.6523
Running validation...
Epoch 5 - Validation loss: 0.6965
Epoch 6/25


Training Epoch 6:   0%|          | 0/4384 [00:00<?, ?it/s]

Epoch 6 - Average training loss: 0.5977
Running validation...
Epoch 6 - Validation loss: 0.7194
Epoch 7/25


Training Epoch 7:   0%|          | 0/4384 [00:00<?, ?it/s]

Epoch 7 - Average training loss: 0.5501
Running validation...
Epoch 7 - Validation loss: 0.7253
Epoch 8/25


Training Epoch 8:   0%|          | 0/4384 [00:00<?, ?it/s]

Epoch 8 - Average training loss: 0.5141
Running validation...
Epoch 8 - Validation loss: 0.7414
Epoch 9/25


Training Epoch 9:   0%|          | 0/4384 [00:00<?, ?it/s]

Epoch 9 - Average training loss: 0.4679
Running validation...
Epoch 9 - Validation loss: 0.8105
Epoch 10/25


Training Epoch 10:   0%|          | 0/4384 [00:00<?, ?it/s]

Epoch 10 - Average training loss: 0.4352
Running validation...
Epoch 10 - Validation loss: 0.8216
Epoch 11/25


Training Epoch 11:   0%|          | 0/4384 [00:00<?, ?it/s]

Epoch 11 - Average training loss: 0.3979
Running validation...
Epoch 11 - Validation loss: 0.8363
Epoch 12/25


Training Epoch 12:   0%|          | 0/4384 [00:00<?, ?it/s]

Epoch 12 - Average training loss: 0.3760
Running validation...
Epoch 12 - Validation loss: 0.8645
Epoch 13/25


Training Epoch 13:   0%|          | 0/4384 [00:00<?, ?it/s]

Epoch 13 - Average training loss: 0.3549
Running validation...
Epoch 13 - Validation loss: 0.8583
Epoch 14/25


Training Epoch 14:   0%|          | 0/4384 [00:00<?, ?it/s]

Epoch 14 - Average training loss: 0.3361
Running validation...
Epoch 14 - Validation loss: 0.8899
Epoch 15/25


Training Epoch 15:   0%|          | 0/4384 [00:00<?, ?it/s]

Epoch 15 - Average training loss: 0.3148
Running validation...
Epoch 15 - Validation loss: 0.9213
Epoch 16/25


Training Epoch 16:   0%|          | 0/4384 [00:00<?, ?it/s]

Epoch 16 - Average training loss: 0.3008
Running validation...
Epoch 16 - Validation loss: 0.9355
Epoch 17/25


Training Epoch 17:   0%|          | 0/4384 [00:00<?, ?it/s]

Epoch 17 - Average training loss: 0.2786
Running validation...
Epoch 17 - Validation loss: 0.9234
Epoch 18/25


Training Epoch 18:   0%|          | 0/4384 [00:00<?, ?it/s]

Epoch 18 - Average training loss: 0.2732
Running validation...
Epoch 18 - Validation loss: 0.9608
Epoch 19/25


Training Epoch 19:   0%|          | 0/4384 [00:00<?, ?it/s]

Epoch 19 - Average training loss: 0.2535
Running validation...
Epoch 19 - Validation loss: 1.0122
Epoch 20/25


Training Epoch 20:   0%|          | 0/4384 [00:00<?, ?it/s]

Epoch 20 - Average training loss: 0.2409
Running validation...
Epoch 20 - Validation loss: 1.0128
Epoch 21/25


Training Epoch 21:   0%|          | 0/4384 [00:00<?, ?it/s]

Epoch 21 - Average training loss: 0.2377
Running validation...
Epoch 21 - Validation loss: 1.0206
Epoch 22/25


Training Epoch 22:   0%|          | 0/4384 [00:00<?, ?it/s]

Epoch 22 - Average training loss: 0.2305
Running validation...
Epoch 22 - Validation loss: 1.0251
Epoch 23/25


Training Epoch 23:   0%|          | 0/4384 [00:00<?, ?it/s]

Epoch 23 - Average training loss: 0.2273
Running validation...
Epoch 23 - Validation loss: 1.0233
Epoch 24/25


Training Epoch 24:   0%|          | 0/4384 [00:00<?, ?it/s]

Epoch 24 - Average training loss: 0.2299
Running validation...
Epoch 24 - Validation loss: 1.0273
Epoch 25/25


Training Epoch 25:   0%|          | 0/4384 [00:00<?, ?it/s]

Epoch 25 - Average training loss: 0.2168
Running validation...
Epoch 25 - Validation loss: 1.0284
Loaded best model with validation loss: 0.6685
Training completed!


### Evaluation

In [None]:
def evaluate_model(model, tokenizer, test_dataset, max_length=1024, batch_size=16):
    """Generate answers for ``test_dataset`` and score them by exact string match.

    Args:
        model: Model exposing ``generate``; it is moved to the selected device.
        tokenizer: Tokenizer used both to encode the examples (through
            ``T5FineTuningDataset``) and to decode predictions and labels.
        test_dataset: Examples to evaluate, wrapped in ``T5FineTuningDataset``.
        max_length: Token length for encoding and the ``max_length`` passed
            to ``generate``.
        batch_size: Evaluation batch size.

    Returns:
        ``(accuracy, predictions, references)`` where ``accuracy`` is a
        percentage (0-100) and the other two are lists of decoded strings.
    """
    # Fall back to CPU so evaluation still runs on a runtime without a GPU.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)

    # Prepare test dataset
    test_dataset_processed = T5FineTuningDataset(test_dataset, tokenizer, max_length)
    test_loader = DataLoader(test_dataset_processed, batch_size=batch_size)

    model.eval()
    predictions = []
    references = []

    print("Starting evaluation...")
    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Evaluating"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)

            # Generate predictions
            outputs = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_length=max_length
            )

            # Decode predictions and references
            preds = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]

            # Labels are only decoded, never fed to the model, so they are
            # left where the loader put them instead of being moved to the device.
            # Replace -100 with pad token id first
            label_ids = batch["labels"].clone()
            label_ids[label_ids == -100] = tokenizer.pad_token_id
            refs = [tokenizer.decode(label, skip_special_tokens=True) for label in label_ids]

            predictions.extend(preds)
            references.extend(refs)

    # Calculate accuracy
    accuracy = accuracy_score(references, predictions) * 100
    return accuracy, predictions, references

In [None]:
# Score the fine-tuned model on the "test" split.
accuracy, predictions, references = evaluate_model(model, tokenizer, dataset["test"])

Starting evaluation...


Evaluating:   0%|          | 0/77 [00:00<?, ?it/s]

  type_true = type_of_target(y_true, input_name="y_true")
  type_pred = type_of_target(y_pred, input_name="y_pred")


### Results

In [None]:
# 2nd run - Final Evaluation Accuracy: 43.98%
def display_evaluation_results(accuracy, predictions, references, num_samples=5):
    """Print the accuracy followed by the first few reference/prediction pairs.

    Args:
        accuracy: Accuracy as a percentage; printed with two decimals.
        predictions: Sequence of predicted strings.
        references: Sequence of expected strings, aligned with ``predictions``.
        num_samples: Upper bound on how many pairs are shown.
    """
    print(f"Final Evaluation Accuracy: {accuracy:.2f}%")

    # Show at most `num_samples` pairs, fewer if there are fewer predictions.
    print("\nSample predictions:")
    shown = min(num_samples, len(predictions))
    idx = 0
    while idx < shown:
        print(f"Reference: {references[idx]}")
        print(f"Prediction: {predictions[idx]}")
        print("-" * 50)
        idx += 1


In [None]:
# Print the accuracy and up to five reference/prediction pairs.
display_evaluation_results(accuracy, predictions, references, num_samples=5)

Final Evaluation Accuracy: 43.98%

Sample predictions:
Reference: wooded area
Prediction: wooded area
--------------------------------------------------
Reference: go downtown
Prediction: go downtown
--------------------------------------------------
Reference: play tag
Prediction: play tag
--------------------------------------------------
Reference: great outdoors
Prediction: corn fields
--------------------------------------------------
Reference: club
Prediction: meeting
--------------------------------------------------


In [None]:
# 1st run - Final Evaluation Accuracy: 42.67%
def display_evaluation_results(accuracy, predictions, references, num_samples=5):
    """Report the accuracy and a handful of reference/prediction pairs.

    NOTE(review): this redefines the identically named function from the
    earlier Results cell; one of the two definitions can likely be removed.

    Args:
        accuracy: Accuracy as a percentage; printed with two decimals.
        predictions: Sequence of predicted strings.
        references: Sequence of expected strings, aligned with ``predictions``.
        num_samples: Upper bound on how many pairs are shown.
    """
    print(f"Final Evaluation Accuracy: {accuracy:.2f}%")

    # List the leading examples, capped by the number of predictions available.
    print("\nSample predictions:")
    limit = min(num_samples, len(predictions))
    for position in range(limit):
        reference, prediction = references[position], predictions[position]
        print(f"Reference: {reference}")
        print(f"Prediction: {prediction}")
        print("-" * 50)


In [None]:
# Print the accuracy and up to five reference/prediction pairs.
display_evaluation_results(accuracy, predictions, references, num_samples=5)

Final Evaluation Accuracy: 42.67%

Sample predictions:
Reference: wooded area
Prediction: british columbia
--------------------------------------------------
Reference: go downtown
Prediction: go downtown
--------------------------------------------------
Reference: play tag
Prediction: play tag
--------------------------------------------------
Reference: great outdoors
Prediction: corn fields
--------------------------------------------------
Reference: club
Prediction: meeting
--------------------------------------------------


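Results vary between runs (42.67% on the first run, 43.98% on the second): only the train/validation split is seeded, while training-batch shuffling is not.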