In [42]:
from datasets import load_dataset
import evaluate
dataset = load_dataset("glue", "sst2")
metric = evaluate.load("glue", "sst2")

In [43]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

num_labels = 2
model_checkpoint = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)

Some weights of T5ForSequenceClassification were not initialized from the model checkpoint at google/flan-t5-base and are newly initialized: ['classification_head.dense.bias', 'classification_head.dense.weight', 'classification_head.out_proj.weight', 'classification_head.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [44]:
import numpy as np
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return metric.compute(predictions=predictions, references=labels)

In [45]:
from peft import LoraConfig, get_peft_model

config = LoraConfig(
    r=16,
    lora_alpha=16,
    target_modules=["q","k","v","o"],
    lora_dropout=0.1,
    bias="none",
    modules_to_save=["classifier"],
)
lora_model = get_peft_model(model, config)

def print_trainable_parameters(model):
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param:.2f}"
    )
    
print_trainable_parameters(lora_model)

trainable params: 3538944 || all params: 227034626 || trainable%: 1.56


In [46]:
import os
model_name = model_checkpoint.split("/")[-1]
batch_size = 64
path = "flan-t5-base-finetuned-lora-sst2"
if not os.path.exists(path):
    os.makedirs(path)

args = TrainingArguments(
    "./flan-t5-base-finetuned-lora-sst2",
    remove_unused_columns=False,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-3,
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=batch_size,
    fp16=True,
    num_train_epochs=1,
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

In [47]:
def preprocess_function(examples):
    return tokenizer(examples["sentence"], truncation=True,padding=True)
encoded_dataset = dataset.map(preprocess_function, batched=True).remove_columns(["sentence","idx"])

Map:   0%|          | 0/872 [00:00<?, ? examples/s]

In [48]:
import torch
# def collate_fn(examples):
#     data = [tokenizer(example["sentence"], truncation=True) for example in examples]
#     data["label"] = torch.tensor([example["label"] for example in examples])
#     return data

trainer = Trainer(
    lora_model,
    args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [49]:
train_results = trainer.train()

  0%|          | 0/526 [00:00<?, ?it/s]

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 0.0, 'learning_rate': 0.005, 'epoch': 0.02}
{'loss': 0.0, 'learning_rate': 0.005, 'epoch': 0.04}
{'loss': 0.0, 'learning_rate': 0.005, 'epoch': 0.06}
{'loss': 0.0, 'learning_rate': 0.005, 'epoch': 0.08}
{'loss': 0.0, 'learning_rate': 0.005, 'epoch': 0.1}
{'loss': 0.0, 'learning_rate': 0.005, 'epoch': 0.11}
{'loss': 0.0, 'learning_rate': 0.005, 'epoch': 0.13}
{'loss': 0.0, 'learning_rate': 0.005, 'epoch': 0.15}
{'loss': 0.0, 'learning_rate': 0.005, 'epoch': 0.17}
{'loss': 0.0, 'learning_rate': 0.005, 'epoch': 0.19}
{'loss': 0.0, 'learning_rate': 0.005, 'epoch': 0.21}
{'loss': 0.0, 'learning_rate': 0.005, 'epoch': 0.23}
{'loss': 0.0, 'learning_rate': 0.005, 'epoch': 0.25}
{'loss': 0.0, 'learning_rate': 0.005, 'epoch': 0.27}
{'loss': 0.0, 'learning_rate': 0.005, 'epoch': 0.29}
{'loss': 0.0, 'learning_rate': 0.005, 'epoch': 0.3}
{'loss': 0.0, 'learning_rate': 0.005, 'epoch': 0.32}
{'loss': 0.0, 'learning_rate': 0.005, 'epoch': 0.34}
{'loss': 0.0, 'learning_rate': 0.005, 'epoch': 0

  0%|          | 0/28 [00:00<?, ?it/s]

{'eval_runtime': 2.8174, 'eval_samples_per_second': 309.504, 'eval_steps_per_second': 9.938, 'epoch': 1.0}


KeyError: 'eval_accuracy'

In [50]:
trainer.evaluate()

  0%|          | 0/28 [00:00<?, ?it/s]

{'eval_runtime': 2.6988, 'eval_samples_per_second': 323.112, 'eval_steps_per_second': 10.375, 'epoch': 1.0}


{'eval_runtime': 2.6988,
 'eval_samples_per_second': 323.112,
 'eval_steps_per_second': 10.375,
 'epoch': 1.0}