In [1]:
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    TrainingArguments,
    Trainer,
)
from datasets import load_dataset
import evaluate
import torch

# Hugging Face checkpoint id used to load both the tokenizer and the seq2seq model.
model_name_or_path = "google/flan-t5-base"
# Learning rate handed to TrainingArguments as `learning_rate`.
lr = 1e-3
# Per-device batch size, used for both the train and eval batch-size arguments.
batch_size = 32
# Passed as `num_train_epochs` to the training-argument objects.
num_epochs = 1

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# GLUE task to fine-tune on; the same string is reused in the output directory name.
task = "mnli"
# Load the `task` configuration of the pre-processed GLUE dataset.
dataset = load_dataset(path="JsSparkYyx/processed_glue", name=task)

In [5]:
# Peek at the first five training rows to inspect the prompt text stored in `source`.
dataset['train'][:5]

{'source': ['Take the following as truth: Conceptually cream skimming has two basic dimensions - product and geography.\nThen the following statement: "Product and geography are what make cream skimming work. " is true, false, or inconclusive?',
  'Take the following as truth: you know during the season and i guess at at your level uh you lose them to the next level if if they decide to recall the the parent team the Braves decide to call to recall a guy from triple A then a double A guy goes up to replace him and a single A guy goes up to replace him\nThen the following statement: "You lose the things to the following level if the people recall." is true, false, or inconclusive?',
  'Take the following as truth: One of our number will carry out your instructions minutely.\nThen the following statement: "A member of my team will execute your orders with immense precision." is true, false, or inconclusive?',
  'Take the following as truth: How do you know? All this is their information 

In [None]:
# Tokenizer matching the checkpoint; shared by preprocessing, collation and metric decoding.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=model_name_or_path,
)
def tokenize_function(examples):
    """Tokenize a batch of examples for seq2seq training.

    Args:
        examples: Batch mapping with a ``'source'`` entry (input texts) and a
            ``'target'`` entry (expected output texts).

    Returns:
        The tokenizer output for the source texts, with an added ``'labels'``
        entry holding the ``input_ids`` of the tokenized target texts.
    """
    # max_length=None together with truncation=True means "truncate to the
    # model's own maximum length" (this is also the tokenizer default).
    encoded = tokenizer(examples["source"], max_length=None, truncation=True)
    target_ids = tokenizer(examples["target"], max_length=None, truncation=True)["input_ids"]
    encoded["labels"] = target_ids
    return encoded
# Tokenize every split in batches and drop the raw columns so that only the
# fields produced by tokenize_function remain.
tokenized_datasets = dataset.map(
    function=tokenize_function,
    batched=True,
    remove_columns=dataset["train"].column_names,
)

Map:   0%|          | 0/19643 [00:00<?, ? examples/s]

In [None]:
from peft import LoraConfig, TaskType, get_peft_model

# LoRA adapter configuration for the T5 attention projections.
config = LoraConfig(
    # Declare the task so PEFT wraps the model with its seq2seq-specific class
    # (forward/generate handling and the task recorded in the adapter config).
    task_type=TaskType.SEQ_2_SEQ_LM,
    r=16,
    lora_alpha=16,
    # T5 names its attention projections q / k / v / o.
    target_modules=["q", "k", "v", "o"],
    lora_dropout=0.1,
    bias="none",
    # No `modules_to_save`: the previous ["classifier"] entry named a module
    # that does not exist in a T5 seq2seq model, so it matched nothing.
)

# Short checkpoint name (text after the last "/"), used for output directories.
model_name = model_name_or_path.split("/")[-1]

# Load the base model once, then wrap it so only the adapter weights train.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path, return_dict=True)
model = get_peft_model(model, config)

# The collator receives the model alongside the tokenizer when padding batches.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
model.print_trainable_parameters()

trainable params: 3,538,944 || all params: 251,116,800 || trainable%: 1.4092820552029972


In [None]:
# NOTE(review): the Seq2SeqTrainer built later in this notebook is given `args`,
# not `training_args`; nothing visible here consumes this object.
training_args = TrainingArguments(
    # Where checkpoints are written.
    output_dir=f"{model_name}-finetuned-lora-{task}",
    # Schedule: evaluate and checkpoint once per epoch.
    num_train_epochs=num_epochs,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=False,
    # Optimisation hyper-parameters.
    learning_rate=lr,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # Keep every dataset column when batches are built.
    remove_unused_columns=False,
)

In [None]:
# BLEU scorer used by compute_metrics. `evaluate` is already imported in the
# notebook's first cell, so it is not imported a second time here.
metric = evaluate.load("sacrebleu")

In [None]:
import numpy as np

def compute_metrics(eval_preds):
    """Decode generated predictions and references, then score them.

    Args:
        eval_preds: ``(preds, labels)`` pair of token-id arrays. ``preds`` may
            also be a tuple whose first element holds the predictions.

    Returns:
        dict with ``"bleu"`` (the ``score`` field of the metric result) and
        ``"exact_match"`` (fraction of predictions whose stripped text equals
        the stripped reference text).
    """
    preds, labels = eval_preds
    # In case the model returns more than the predictions.
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 marks ignored/padded positions and is not a valid token id, so the
    # tokenizer cannot decode it. It can show up in the generated predictions
    # as well as in the labels, so swap it for the pad token in both.
    pad_id = tokenizer.pad_token_id
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = [
        text.strip()
        for text in tokenizer.batch_decode(preds, skip_special_tokens=True)
    ]
    decoded_labels = [
        text.strip()
        for text in tokenizer.batch_decode(labels, skip_special_tokens=True)
    ]

    # The BLEU metric expects a list of reference lists (one list per prediction).
    result = metric.compute(
        predictions=decoded_preds,
        references=[[text] for text in decoded_labels],
    )

    # The targets here are short answer strings, so string equality is the
    # more direct quality signal; report it next to BLEU.
    matches = sum(pred == ref for pred, ref in zip(decoded_preds, decoded_labels))
    exact_match = matches / len(decoded_labels) if decoded_labels else 0.0

    return {"bleu": result["score"], "exact_match": exact_match}

In [None]:
from transformers import Seq2SeqTrainingArguments

# Training configuration actually passed to the Seq2SeqTrainer.
args = Seq2SeqTrainingArguments(
    f"{model_name}-finetuned-lora-{task}",
    # No evaluation during training; a checkpoint is written every epoch.
    evaluation_strategy="no",
    save_strategy="epoch",
    # Use the notebook-level `lr` setting instead of a separate hard-coded
    # value, so the hyper-parameter defined in the config cell is honoured.
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    # Keep at most three checkpoints on disk.
    save_total_limit=3,
    num_train_epochs=num_epochs,
    remove_unused_columns=False,
    # Evaluate/predict by generating sequences, which compute_metrics decodes.
    predict_with_generate=True,
    # NOTE(review): T5-family checkpoints are reported to be numerically
    # fragile under fp16; confirm the loss stays finite or consider bf16.
    fp16=True,
)

In [None]:
from transformers import Seq2SeqTrainer

# Assemble the trainer from the LoRA-wrapped model, its training arguments,
# the tokenized splits, and the batching / scoring helpers.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["valid"],
)

In [None]:
# Run fine-tuning with the trainer configured above.
trainer.train()

  0%|          | 0/245440 [00:00<?, ?it/s]

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


KeyboardInterrupt: 

In [None]:
# Display the tokenized training split (its features and row count).
tokenized_datasets["train"]

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 392702
})