### Setup

In [None]:
# Mount Google Drive so the project files stored under MyDrive are reachable.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# Make the project's helper modules importable.
# Note: this does NOT change the working directory; it only extends the module
# search path. The entry is absolute (anchored at the mount point used above)
# because relative sys.path entries are resolved against the current working
# directory at import time and would break if the directory changed.
import sys
PIPELINE_DIR = '/content/drive/MyDrive/TAC2023/PLABA/HuggingFace_FineTuning_Pipeline'
if PIPELINE_DIR not in sys.path:  # keep the cell idempotent when it is re-run
    sys.path.append(PIPELINE_DIR)

Mounted at /content/drive


In [None]:
# install Hugging Face Libraries
!pip install "peft==0.2.0" --quiet
!pip install "transformers==4.27.2" "datasets==2.9.0" "accelerate==0.17.1" "evaluate==0.4.0" "bitsandbytes==0.37.1" loralib --upgrade --quiet
# install additional dependencies needed for training
!pip install rouge-score tensorboard py7zr --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.3/40.3 kB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.5/7.5 MB[0m [31m55.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m21.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m24.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m85.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m69.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m17.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m462.8/462.8 kB[0m [31m24.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

### Load and prepare the dataset

In [None]:
from transformers import AutoTokenizer
from Helpers import load_and_preprocess_data

# Tokenizer that belongs to the FLAN-T5 XL checkpoint.
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-xl")

# Load the dataset from Drive and preprocess it with the project helper.
# Inputs and outputs are both limited to a length of 70; the result is indexed
# by split name ("train", "validation") in the cells further down.
tokenized_dataset = load_and_preprocess_data(
    "drive/MyDrive/TAC2023/PLABA/PLABA_Dataset_Multi_Ref",
    tokenizer,
    max_input_length=70,
    max_output_length=70,
)

ModuleNotFoundError: ignored

### Fine-Tune T5 with LoRA and bnb int-8

In [None]:
from transformers import AutoModelForSeq2SeqLM
import torch

# Base FLAN-T5 XL checkpoint, loaded in 8-bit (load_in_8bit=True) with the
# device placement chosen automatically (device_map="auto").
model = AutoModelForSeq2SeqLM.from_pretrained(
    "google/flan-t5-xl",
    device_map="auto",
    load_in_8bit=True,
)

In [None]:
from peft import LoraConfig, get_peft_model, prepare_model_for_int8_training, TaskType

# LoRA hyper-parameters: rank-16 adapters on the "q" and "v" modules of a
# sequence-to-sequence language model, with no bias terms trained.
lora_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    target_modules=["q", "v"],
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

# Step 1: prepare the int-8 model for training.
model = prepare_model_for_int8_training(model)

# Step 2: attach the LoRA adapter and report how many parameters are trainable.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# Output recorded for this notebook's flan-t5-xl run:
# trainable params: 9437184 || all params: 2859194368 || trainable%: 0.33006444422319176


trainable params: 9437184 || all params: 2859194368 || trainable%: 0.33006444422319176


In [None]:
from transformers import DataCollatorForSeq2Seq

# Collator that assembles training batches for the seq2seq model; padded
# lengths are rounded up to a multiple of 8.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, pad_to_multiple_of=8)

In [None]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

# Local directory that receives the training logs.
output_dir = "lora-flan-t5-xl"

# Training configuration.
training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    # optimisation
    auto_find_batch_size=True,
    learning_rate=1e-3,  # higher learning rate
    num_train_epochs=4,
    # logging: one TensorBoard entry every 500 steps
    logging_dir=f"{output_dir}/logs",
    logging_strategy="steps",
    logging_steps=500,
    report_to="tensorboard",
    # no intermediate checkpoints are written
    save_strategy="no",
)

# Trainer wired to the LoRA model, the collator and the training split.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    data_collator=data_collator,
)

# Disabling the cache silences warnings during training.
# Remember to re-enable it for inference!
model.config.use_cache = False


In [None]:
# Run fine-tuning; the value returned by train() is shown as the cell result.
train_output = trainer.train()
train_output

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
500,1.0546
1000,0.6538
1500,0.6334
2000,0.5605
2500,0.5408


TrainOutput(global_step=2880, training_loss=0.6658427556355794, metrics={'train_runtime': 6881.831, 'train_samples_per_second': 3.346, 'train_steps_per_second': 0.418, 'total_flos': 2.7788953428688896e+16, 'train_loss': 0.6658427556355794, 'epoch': 4.0})

In [None]:
# Destination on Drive for the LoRA model and the tokenizer files.
peft_model_id = "drive/MyDrive/TAC2023/PLABA/LoRAs/FLAN-t5-xl/results"

# Store the trained LoRA model.
trainer.model.save_pretrained(peft_model_id)

# Optional: uncomment to also save the base model.
# trainer.model.base_model.save_pretrained(peft_model_id)

# Store the tokenizer last so that its return value (the written file paths)
# is what the cell displays.
tokenizer.save_pretrained(peft_model_id)

('drive/MyDrive/TAC2023/PLABA/LoRAs/FLAN-t5-xl/results/tokenizer_config.json',
 'drive/MyDrive/TAC2023/PLABA/LoRAs/FLAN-t5-xl/results/special_tokens_map.json',
 'drive/MyDrive/TAC2023/PLABA/LoRAs/FLAN-t5-xl/results/tokenizer.json')

### Evaluate & run Inference with LoRA FLAN-T5

In [None]:
import torch
from peft import PeftModel, PeftConfig
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Location of the saved LoRA results.
peft_model_id = "drive/MyDrive/TAC2023/PLABA/LoRAs/FLAN-t5-xl/results"

# The saved PEFT config names the base checkpoint to load.
config = PeftConfig.from_pretrained(peft_model_id)
base_checkpoint = config.base_model_name_or_path

# Base model (8-bit, everything placed on device 0) and its tokenizer.
model = AutoModelForSeq2SeqLM.from_pretrained(
    base_checkpoint,
    load_in_8bit=True,
    device_map={"": 0},
)
tokenizer = AutoTokenizer.from_pretrained(base_checkpoint)

# Wrap the base model with the saved LoRA weights and switch to eval mode.
model = PeftModel.from_pretrained(model, peft_model_id, device_map={"": 0})
model.eval()

print("Peft model loaded")




Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)l-00001-of-00002.bin:   0%|          | 0.00/9.45G [00:00<?, ?B/s]

Downloading (…)l-00002-of-00002.bin:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Peft model loaded


In [None]:

# Generate one prediction per validation example and collect the decoded text.
predictions = []
for index, sample in enumerate(tokenized_dataset["validation"], start=1):
    # Build a batch of one and place it on the same device as the model:
    # torch.tensor(...) allocates on the CPU by default, whereas the model is
    # loaded with an explicit device_map in the loading cell.
    input_ids = torch.tensor([sample["input_ids"]], device=model.device)
    output = model.generate(input_ids=input_ids, max_new_tokens=30)
    untok_output = tokenizer.batch_decode(output.detach().cpu().numpy(), skip_special_tokens=True)[0]
    # Running 1-based counter followed by the decoded prediction.
    print(index, untok_output)
    predictions.append(untok_output)


### Evaluation

In [None]:
# EASSE Installation
! pip install --upgrade setuptools --quiet
! pip install git+https://github.com/feralvam/easse.git --quiet
from easse.cli import evaluate_system_output # Test if download was successful

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py egg_info[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Preparing metadata (setup.py) ... [?25l[?25herror
[1;31merror[0m: [1mmetadata-generation-failed[0m

[31m×[0m Encountered error while generating package metadata.
[31m╰─>[0m See above for output.

[1;35mnote[0m: This is an issue with the package mentioned above, not pip.
[1;36mhint[0m: See above for details.


ModuleNotFoundError: ignored

In [None]:
# import evaluate
# import numpy as np
# from easse.cli import evaluate_system_output
# from easse.sari import corpus_sari
# from easse.bertscore import corpus_bertscore

# Token ids of the validation inputs and of their references.
validation_split = tokenized_dataset["validation"]
sources, references = validation_split["input_ids"], validation_split["labels"]

# Hand-copied token ids for validation example 122, without trailing padding:
# `a` is the source sequence and `b` holds two reference sequences.
# They are defined before first use so the cell also runs on a fresh kernel
# (previously `b` was printed before it was assigned).
a = [21603, 10, 37, 25314, 17890, 3178, 41, 439, 308, 61, 65, 118, 4234, 11411, 12, 2665, 224, 1124, 12, 3, 14177, 4526, 5, 1]
b = [[37, 25314, 17890, 3178, 6, 3, 9, 731, 18, 1720, 115, 87, 6739, 18, 6589, 3178, 6, 65, 118, 4234, 261, 12, 2665, 224, 1124, 5, 1], [37, 25314, 17890, 3178, 41, 439, 308, 61, 41, 9, 731, 18, 1720, 115, 3178, 61, 65, 4234, 4260, 175, 1124, 12, 128, 1504, 5, 1]]

print(sources[122])
print(references[122])
print(b)

# NOTE(review): the recorded output shows sources[122] carrying trailing 0
# padding that `a` does not have, so this equality presumably prints False --
# confirm whether the padding should be stripped before comparing.
print(sources[122] == a)

# Metric computation, disabled until the EASSE import works:
# bleu = evaluate.load("bleu")
# rouge = evaluate.load('rouge')
# bleu_eval = bleu.compute(predictions=predictions, references=references)
# rouge_eval = rouge.compute(predictions=predictions, references=references)
# sari_easse = corpus_sari(sources, predictions, np.array(references).T, tokenizer='13a', lowercase=True)
# _, _, bertscore_easse = corpus_bertscore(predictions, references, tokenizer='13a', lowercase=True)
# scores = {
#     "BLEU": round(bleu_eval['bleu']*100,2),
#     "ROUGE-1": round(rouge_eval['rouge1']*100,2),
#     "ROUGE-2": round(rouge_eval['rouge2']*100,2),
#     "ROUGE-L": round(rouge_eval['rougeL']*100,2),
#     "SARI": round(sari_easse, 2),
#     "BERTScore": round(bertscore_easse*100, 2),
# }
# print("Evaluations terminated:\n", scores)

[21603, 10, 37, 25314, 17890, 3178, 41, 439, 308, 61, 65, 118, 4234, 11411, 12, 2665, 224, 1124, 12, 3, 14177, 4526, 5, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[[37, 25314, 17890, 3178, 6, 3, 9, 731, 18, 1720, 115, 87, 6739, 18, 6589, 3178, 6, 65, 118, 4234, 261, 12, 2665, 224, 1124, 5, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [37, 25314, 17890, 3178, 41, 439, 308, 61, 41, 9, 731, 18, 1720, 115, 3178, 61, 65, 4234, 4260, 175, 1124, 12, 128, 1504, 5, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
[[37, 25314, 17890, 3178, 6, 3, 9, 731, 18, 1720, 115, 87, 6739, 18, 6589, 3178, 6, 65, 118, 4234, 261, 12, 2665, 224, 1124, 5, 1], [37, 25314, 17890, 3178, 41, 439, 308, 61, 41, 9, 731, 18, 1720, 115, 3178, 61, 65, 4