<a href="https://colab.research.google.com/github/MarcosVeniciu/HotelQA-RAG/blob/main/Fine_tuning_code/FineTuningTinyLlama.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%capture
# Dependency installation cell. %%capture is a cell magic, so it has to stay on
# the first line of the cell; it hides the pip output.
# Installs Unsloth, Xformers (Flash Attention) and all other packages!
# First command: Unsloth with its "colab-new" extra, straight from GitHub.
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
# Second command: the remaining packages, installed with --no-deps. xformers and
# trl carry upper version bounds; peft, accelerate and bitsandbytes are unpinned.
!pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes

In [None]:
from transformers.trainer_utils import get_last_checkpoint
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported
from unsloth import FastLanguageModel
from datasets import load_dataset
from trl import SFTTrainer
import torch
import os

In [None]:
# Run configuration.
# salvar_GDrive: when True, Google Drive is mounted so checkpoints are saved there.
# project_path:  name of the project folder.
salvar_GDrive: bool = False
project_path: str = "TinyLlama"

In [None]:
# Optional step: only runs when salvar_GDrive is True.
if salvar_GDrive:
  # Mount Google Drive
  # The import stays local because google.colab is only needed on this branch.
  from google.colab import drive
  drive.mount('/content/drive')

In [None]:
# Settings used to load the base model (max_seq_length is reused by the trainer cell).
max_seq_length = 2048  # maximum sequence length passed to the loader
dtype = None           # None = auto detection; float16 for Tesla T4/V100, bfloat16 for Ampere+
load_in_4bit = False   # True would enable 4-bit quantization to reduce memory usage

# Collect the loader arguments, then load model and tokenizer in a single call.
base_model_kwargs = dict(
  model_name = "unsloth/tinyllama-bnb-4bit",
  max_seq_length = max_seq_length,
  dtype = dtype,
  load_in_4bit = load_in_4bit,
)
model, tokenizer = FastLanguageModel.from_pretrained(**base_model_kwargs)

# Report the parameter count with thousands separators.
total_parameters = model.num_parameters()
print(f"\n{total_parameters:,}")

In [None]:
# LoRA configuration. This cell rebinds `model`, so running it twice wraps the
# already-wrapped model again; reload the base model before re-running it.
lora_settings = {
  "r": 96,  # LoRA rank; any number > 0 (suggested 8, 16, 32, 64, 128)
  "target_modules": ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj",],
  "lora_alpha": 32,
  "lora_dropout": 0,   # original note: currently only dropout = 0 is supported
  "bias": "none",      # original note: currently only bias = "none" is supported
  "use_gradient_checkpointing": False,  # set to True if you run out of memory
  "random_state": 3407,
  "use_rslora": False,   # rank stabilized LoRA is available but not used
  "loftq_config": None,  # LoftQ is available but not used
}
model = FastLanguageModel.get_peft_model(model, **lora_settings)

# Parameter count after the adapters were attached.
parameters_with_lora = model.num_parameters()
print(f"\n{parameters_with_lora:,}")

In [None]:
# Training prompt template (Alpaca layout). The segments are joined with no
# separator, so the first three sentences run together without a space
# ("...context.Only respond..."); that is the exact text used for training.
# Placeholders: bos_token, instruction, input, response, eos_token.
_TRAIN_PROMPT_SEGMENTS = [
  "{bos_token}Below is an instruction that describes a task, paired with an input that provides further context.",
  "Only respond with exact facts from context.",
  "Write a response between <res> and </res>.\n\n",
  "### Instruction:\n",
  "{instruction}\n\n",
  "### Input:\n",
  "{input}\n\n",
  "### Response:\n",
  "<res>{response}</res>{eos_token}",
]

ALPACA_PROMPT_DICT = {"prompt_context": "".join(_TRAIN_PROMPT_SEGMENTS)}

# Special tokens, read once from the tokenizer loaded earlier in the notebook.
# NOTE(review): the template inserts BOS/EOS as plain text; whether the trainer's
# tokenization adds them a second time is not visible from this cell — confirm.
BOS_TOKEN = tokenizer.bos_token
EOS_TOKEN = tokenizer.eos_token

def formatPrompt(example):
  """Render one dataset row as a single training string.

  Args:
    example: mapping with 'Question', 'Context' and 'Answer' entries.

  Returns:
    A dict with one key, "text", holding the filled-in prompt template.
  """
  template = ALPACA_PROMPT_DICT['prompt_context']
  fields = {
    "bos_token": BOS_TOKEN,
    "instruction": f"respond: {example['Question']}",
    "input": f"context: {example['Context']}",
    "response": example['Answer'],
    "eos_token": EOS_TOKEN,
  }
  return {"text": template.format(**fields)}


# downlad dataset
if not os.path.exists('Dataset_V3_train_9.5k.parquet'):
  !wget https://github.com/MarcosVeniciu/HotelQA-RAG/raw/main/Dataset/9,5K/Dataset_V3_train_9.5k.parquet
  !wget https://github.com/MarcosVeniciu/HotelQA-RAG/raw/main/Dataset/9,5K/Dataset_V3_test_9.5k.parquet

dataset = load_dataset('parquet', data_files={'train': 'Dataset_V3_train_9.5k.parquet',
                                              'test': 'Dataset_V3_test_9.5k.parquet'})
# Aplicando a função formatPrompt ao dataset
dataset = dataset.map(formatPrompt, remove_columns=['Context', 'Question', 'Answer'])

In [None]:
# Output folder: inside the Drive training folder when /content/drive exists,
# otherwise a folder named after the project in the current working directory.
drive_root = "/content/drive"
project_dir = (
  os.path.join("/content/drive/MyDrive/Treinamento", project_path)
  if os.path.exists(drive_root)
  else project_path
)
print(f"Project diretory: {project_dir}")

# Number of training epochs.
num_epocas = 8

# Supervised fine-tuning trainer.
# The shuffle is seeded with the same value used for TrainingArguments.seed and
# the LoRA random_state, so the order of the examples handed to the trainer is
# the same on every run (for instance when the notebook is re-run to resume).
trainer = SFTTrainer(
  model = model,
  tokenizer = tokenizer,
  train_dataset = dataset['train'].shuffle(seed = 3407),
  dataset_text_field = "text",
  max_seq_length = max_seq_length,
  dataset_num_proc = 2,
  packing = True, # Packs short sequences together to save time!
  args = TrainingArguments(
    num_train_epochs = num_epocas,
    per_device_train_batch_size = 4,
    gradient_accumulation_steps = 4, # 4 x 4 = 16 sequences per optimizer step and device
    save_total_limit=2,              # keep only the two most recent checkpoints
    save_strategy="steps",
    save_steps=50,
    warmup_ratio = 0.4,
    learning_rate = 2e-5,
    # Exactly one of fp16 / bf16 is enabled, depending on hardware support.
    fp16 = not is_bfloat16_supported(),
    bf16 = is_bfloat16_supported(),
    logging_steps = 10,
    optim = "adamw_8bit",
    weight_decay = 0.1,
    lr_scheduler_type = "cosine",
    seed = 3407,
    output_dir=project_dir,
  ),
)

In [None]:
# If the project folder already holds a checkpoint, training continues from the
# most recent one; otherwise a new run is started.
# get_last_checkpoint lists the folder contents, so it is only called when the
# folder exists; a missing folder simply means there is nothing to resume.
last_checkpoint = get_last_checkpoint(project_dir) if os.path.isdir(project_dir) else None
if last_checkpoint is not None: # resume from the last saved checkpoint
  print(f"Continuando treinamento a partir de: {last_checkpoint}\n")
  trainer_stats = trainer.train(resume_from_checkpoint=last_checkpoint)
else: # start a new training run
  print("Começando um novo treinamento:")
  trainer_stats = trainer.train()

# Save the final model inside the project folder.
# NOTE(review): the folder is called "gemma_ft_saved" although the model is
# TinyLlama; the name is kept because the reload cells refer to it.
trainer.save_model(os.path.join(project_dir, "gemma_ft_saved"))

In [None]:
# Checkpoint folder read by the reload cells that follow.
# NOTE(review): hard-coded path and step number — confirm that this checkpoint
# exists in the current session before running those cells.
lora_local = "/content/TinyLlama/checkpoint-440"

In [None]:
from transformers import AutoTokenizer
from peft import AutoPeftModelForCausalLM

# Reload model and tokenizer from the checkpoint folder in `lora_local`.
# Note that this rebinds the notebook-wide `tokenizer` name.
peft_load_kwargs = {"load_in_4bit": load_in_4bit}
modelos = AutoPeftModelForCausalLM.from_pretrained(lora_local, **peft_load_kwargs)
tokenizer = AutoTokenizer.from_pretrained(lora_local)

# Report the parameter count of the reloaded model.
reloaded_parameters = modelos.num_parameters()
print(f"\nTotal de Parâmetros: {reloaded_parameters:,}")

In [None]:
# Imported locally: the import cell at the top of the notebook does not bring in
# AutoModelForCausalLM, so without this line the cell fails with a NameError.
from transformers import AutoModelForCausalLM

# Load a causal LM from the checkpoint folder and report its parameter count.
base_model = AutoModelForCausalLM.from_pretrained(lora_local)
print(f"\nTotal de Parâmetros: {base_model.num_parameters():,}")

**Coisas a serem observadas**

1 teste \
Após eu treinar por 26 épocas, o modelo respondeu mais ou menos bem para a tarefa, tanto quando o modelo usado era o mesmo que foi treinado quanto quando ele é carregado a partir do modelo salvo. \
Mas ele perdeu a capacidade de responder a outros tipos de tarefas.
Quando eu carrego o modelo a partir do modelo_base e o mesclo com o modelo salvo, ele dá uma resposta curta e fica repetindo todo o prompt em loop, porém ele não perdeu a capacidade de resolver outras tarefas.

--------------------------

2 teste \
Para o segundo teste, vou manter o uso do checkpoint, porém vou mudar a forma como uso os tokens `<s>` e `</s>`, para ver se isso muda algo.


In [None]:
# Sample context and questions for the inference cells.
# pergunta asks about a fact stated in the context (pool); pergunta2 asks about
# one that is not mentioned (free wi-fi).
contexto = "Hotel Fasano, São Paulo. This luxury hotel, example located in the Jardins neighborhood, offers elegant accommodations with stunning city views. The hotel features a pool, spa and wellness center, gourmet restaurant, fitness center, and 24-hour reception."
pergunta = "Does Hotel Fasano have a pool available for guests?"
pergunta2 = "Does Hotel Fasano have a free wi-fi available for guests?"

# Inference prompt template. It rebinds ALPACA_PROMPT_DICT and must stay
# word-for-word identical to the training template up to the opening <res> tag:
# the fine-tuned model only ever saw the "Only respond with exact facts from
# context." instruction, so the same sentence is used here. No BOS/EOS
# placeholders: the prompt stops right after "<res>" so that the model writes
# the answer.
ALPACA_PROMPT_DICT = {
  "prompt_context": (
    "Below is an instruction that describes a task, paired with an input that provides further context."
    "Only respond with exact facts from context."
    "Write a response between <res> and </res>.\n\n"
    "### Instruction:\n"
    "{instruction}\n\n"
    "### Input:\n"
    "{input}\n\n"
    "### Response:\n"
    "<res>"
  )
}


In [None]:
from transformers import TextStreamer

# Inference with the in-memory model right after training (pool question).
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# Fill in the prompt, tokenize it as a batch of one and move it to the model's device.
prompt_text = ALPACA_PROMPT_DICT['prompt_context'].format(
  instruction=f"respond: {pergunta}",
  input=f"context: {contexto}",
)
inputs = tokenizer([prompt_text], return_tensors = "pt").to(model.device)

# Stream the generated tokens to the cell output as they are produced.
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 250)

In [None]:
from transformers import TextStreamer

# Inference with the in-memory model right after training (wi-fi question,
# whose answer is not present in the context).
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

prompt_text = ALPACA_PROMPT_DICT['prompt_context'].format(
  instruction=f"respond: {pergunta2}",
  input=f"context: {contexto}",
)
# Inputs go to the model's own device instead of a hard-coded "cuda", matching
# the first inference cell.
inputs = tokenizer([prompt_text], return_tensors = "pt").to(model.device)

# Stream the generated tokens to the cell output as they are produced.
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 250)

In [None]:
from transformers import TextStreamer

# Inference with the in-memory model right after training, on a task unrelated
# to the fine-tuning data (continuing a number sequence).
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

prompt_text = ALPACA_PROMPT_DICT['prompt_context'].format(
  instruction="Continue the fibonnaci sequence.",
  input="1, 1, 2, 3, 5, 8",
)
# Inputs go to the model's own device instead of a hard-coded "cuda", matching
# the first inference cell.
inputs = tokenizer([prompt_text], return_tensors = "pt").to(model.device)

# Stream the generated tokens to the cell output as they are produced.
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 250)

In [None]:
# Inference with the model reloaded from the folder written by the training cell.
from transformers import TextStreamer
from unsloth import FastLanguageModel

# Same location trainer.save_model() wrote to, instead of a hard-coded Drive
# folder that this notebook never creates.
saved_model_dir = os.path.join(project_dir, "gemma_ft_saved")

modelo, tokenizero = FastLanguageModel.from_pretrained(
  model_name = saved_model_dir, # YOUR MODEL YOU USED FOR TRAINING
  max_seq_length = max_seq_length,
  dtype = dtype,
  load_in_4bit = load_in_4bit,
)
FastLanguageModel.for_inference(modelo) # Enable native 2x faster inference

prompt_text = ALPACA_PROMPT_DICT['prompt_context'].format(
  instruction=f"respond: {pergunta}",
  input=f"context: {contexto}",
)
# Inputs are moved to the reloaded model's device.
inputs = tokenizero([prompt_text], return_tensors = "pt").to(modelo.device)

# Stream the generated tokens to the cell output as they are produced.
text_streamer = TextStreamer(tokenizero)
_ = modelo.generate(**inputs, streamer = text_streamer, max_new_tokens = 120)

In [None]:
from transformers import TextStreamer

# Inference with the model reloaded from the saved folder, on a task unrelated
# to the fine-tuning data.
FastLanguageModel.for_inference(modelo) # Enable native 2x faster inference

prompt_text = ALPACA_PROMPT_DICT['prompt_context'].format(
  instruction="Continue the fibonnaci sequence.",
  input="1, 1, 2, 3, 5, 8",
)
inputs = tokenizero([prompt_text], return_tensors = "pt").to(modelo.device)

text_streamer = TextStreamer(tokenizero)
# Generate with `modelo`, the reloaded model this cell prepares, and not with the
# in-memory `model` from the training session.
_ = modelo.generate(**inputs, streamer = text_streamer, max_new_tokens = 128)

In [None]:
# Inference setup: base model combined with the trained LoRA adapters.
from peft import PeftConfig, PeftModel

# Load the base model.
max_seq_length = 2048 # maximum sequence length passed to the loader
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.


modele, tokenizere = FastLanguageModel.from_pretrained(
  model_name = "unsloth/tinyllama-bnb-4bit",
  max_seq_length = max_seq_length,
  dtype = dtype,
  load_in_4bit = load_in_4bit,
)

# Folder with the trained adapters: the same location trainer.save_model() wrote
# to, instead of a hard-coded Drive folder that this notebook never creates.
lora_path = os.path.join(project_dir, "gemma_ft_saved")
# Adapter configuration stored next to the weights (kept for inspection).
peft_config = PeftConfig.from_pretrained(lora_path)

# Attach the TRAINED adapters. get_peft_model(model, config) only creates new,
# untrained adapters from the configuration and never reads the saved weights;
# PeftModel.from_pretrained loads them from lora_path.
modele = PeftModel.from_pretrained(modele, lora_path)
FastLanguageModel.for_inference(modele) # Enable native 2x faster inference


In [None]:
from transformers import TextStreamer

# Inference with the base model combined with the saved adapters (pool question).
FastLanguageModel.for_inference(modele) # Enable native 2x faster inference

prompt_text = ALPACA_PROMPT_DICT['prompt_context'].format(
  instruction=f"respond: {pergunta}",
  input=f"context: {contexto}",
)
# Inputs go to the model's own device instead of a hard-coded "cuda".
inputs = tokenizere([prompt_text], return_tensors = "pt").to(modele.device)

# Stream the generated tokens to the cell output as they are produced.
text_streamer = TextStreamer(tokenizere)
_ = modele.generate(**inputs, streamer = text_streamer, max_new_tokens = 120)

In [None]:
from transformers import TextStreamer

# Inference with the base model combined with the saved adapters, on a task
# unrelated to the fine-tuning data.
FastLanguageModel.for_inference(modele) # Enable native 2x faster inference

prompt_text = ALPACA_PROMPT_DICT['prompt_context'].format(
  instruction="Continue the fibonnaci sequence.",
  input="1, 1, 2, 3, 5, 8",
)
# Inputs go to the model's own device instead of a hard-coded "cuda".
inputs = tokenizere([prompt_text], return_tensors = "pt").to(modele.device)

# Stream the generated tokens to the cell output as they are produced.
text_streamer = TextStreamer(tokenizere)
_ = modele.generate(**inputs, streamer = text_streamer, max_new_tokens = 250)

**REFERENCIAS**

[1] [Colab original com tinyLlama](https://colab.research.google.com/drive/1AZghoNBQaMDgWJpi4RbffGM1h6raLUj9?usp=sharing#scrollTo=95_Nn-89DhsL)