<a href="https://colab.research.google.com/github/MarcosVeniciu/HotelQA-RAG/blob/main/Fine_tuning_code/FineTuningTinyLlama.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
# Installs Unsloth, Xformers (Flash Attention) and all other packages!
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes

In [2]:
from transformers.trainer_utils import get_last_checkpoint
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported
from unsloth import FastLanguageModel
from datasets import load_dataset
from trl import SFTTrainer
import torch
import os

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.


model, tokenizer = FastLanguageModel.from_pretrained(
  model_name = "unsloth/tinyllama-bnb-4bit",
  max_seq_length = max_seq_length,
  dtype = dtype,
  load_in_4bit = load_in_4bit,
)

==((====))==  Unsloth: Fast Llama patching release 2024.8
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.1+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.26.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [3]:
model = FastLanguageModel.get_peft_model(
  model,
  r = 32, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
  target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj",],
  lora_alpha = 32,
  lora_dropout = 0, # Currently only supports dropout = 0
  bias = "none",    # Currently only supports bias = "none"
  use_gradient_checkpointing = False, # @@@ IF YOU GET OUT OF MEMORY - set to True @@@
  random_state = 3407,
  use_rslora = False,  # We support rank stabilized LoRA
  loftq_config = None, # And LoftQ
)

Unsloth 2024.8 patched 22 layers with 22 QKV layers, 22 O layers and 22 MLP layers.


In [5]:
ALPACA_PROMPT_DICT = {
  "prompt_context": (
    "{bos_token}Below is an instruction that describes a task, paired with an input that provides further context. "
    "Write a response that appropriately completes the request.\n\n"
    "### Instruction: # Instruction on how to perform the task\n"
    "{instruction}\n\n"
    "### Input: # Context and question\n"
    "{input}\n\n"
    "### Response: # Model should generate the response here \n"
    "{bos_token}{response}{eos_token}"
  ),
  "prompt_no_context": (
    "{bos_token}Below is an instruction that describes a task. "
    "Write a response that appropriately completes the request.\n\n"
    "### Instruction: # Instruction on how to perform the task\n"
    "{instruction}\n\n"
    "### Input: # Context and question\n"
    "{input}\n\n"
    "### Response: # Model should generate the response here \n"
    "{bos_token}{response}{eos_token}"
  ),
}

BOS_TOKEN = tokenizer.bos_token
EOS_TOKEN = tokenizer.eos_token
def formatPrompt(example):
  if example['Context'] == 'No context':
    prompt = ALPACA_PROMPT_DICT['prompt_no_context'].format(
      bos_token=BOS_TOKEN,
      instruction="If the context is 'no context', respond with a generic answer indicating that don't know the hotel.",
      input=f"context: {example['Context']}\nrespond: {example['Question']}",
      response=example['Answer'],
      eos_token=EOS_TOKEN
    )
  else:
    if 'Yes, ' in example['Answer']:
      prompt = ALPACA_PROMPT_DICT['prompt_context'].format(
        bos_token=BOS_TOKEN,
        instruction="When prompted to 'respond', simply generate the answer to the question asked in 'respond' and make sure your answer is detailed, informative and engaging, using the information in context to enhance your answer.",
        input=f"context: {example['Context']}\nrespond: {example['Question']}",
        response=example['Answer'],
        eos_token=EOS_TOKEN
      )
    else:
      prompt = ALPACA_PROMPT_DICT['prompt_context'].format(
        bos_token=BOS_TOKEN,
        instruction="When prompted to 'respond', use the context to answer the question. If the hotel lacks a requested amenity, mention this and highlight other available amenities. Ensure the answer is detailed, informative and engaging, using the information in context to enhance your answer.",
        input=f"context: {example['Context']}\nrespond: {example['Question']}",
        response=example['Answer'],
        eos_token=EOS_TOKEN
      )
  return {"text": prompt}


# downlad dataset
if not os.path.exists('Dataset_V2_train_16k.parquet'):
  !wget https://github.com/MarcosVeniciu/HotelQA-RAG/raw/main/Dataset/16k/Dataset_V2_train_16k.parquet
  !wget https://github.com/MarcosVeniciu/HotelQA-RAG/raw/main/Dataset/16k/Dataset_V2_test_16k.parquet

dataset = load_dataset('parquet', data_files={'train': 'Dataset_V2_train_16k.parquet',
                                              'test': 'Dataset_V2_test_16k.parquet'})
# Aplicando a função formatPrompt ao dataset
dataset = dataset.map(formatPrompt, remove_columns=['Context', 'Question', 'Answer'])

In [6]:
if os.path.exists("/content/drive"):
  project_dir = "/content/drive/MyDrive/Treinamento/TinyLlama"
else:
  project_dir = "TinyLlama"


trainer = SFTTrainer(
  model = model,
  tokenizer = tokenizer,
  train_dataset = dataset['train'].shuffle(),
  dataset_text_field = "text",
  max_seq_length = max_seq_length,
  dataset_num_proc = 2,
  packing = True, # Packs short sequences together to save time!
  args = TrainingArguments(
    num_train_epochs = 25,
    per_device_train_batch_size = 3,
    gradient_accumulation_steps = 3,
    save_total_limit=2,
    save_strategy="steps",
    save_steps=193,
    warmup_ratio = 0.1,
    learning_rate = 2e-5,
    fp16 = not is_bfloat16_supported(),
    bf16 = is_bfloat16_supported(),
    logging_steps = 20,
    optim = "adamw_8bit",
    weight_decay = 0.1,
    lr_scheduler_type = "linear",
    seed = 3407,
    output_dir=project_dir,
  ),
)

Generating train split: 0 examples [00:00, ? examples/s]

In [7]:
#se tiver algum checkpoint salvo na pasta do projeto, ele vai continuar a partir do ultimo salvo.
last_checkpoint = get_last_checkpoint(project_dir)
if last_checkpoint != None: # Continua a partir do ultimo checkpoint salvo
  print(f"Continuando treinamento a partir de: {last_checkpoint}\n")
  trainer_stats = trainer.train(resume_from_checkpoint=last_checkpoint)
else: # começa um novo treinamento
  print("Começando um novo treinamento:")
  trainer_stats = trainer.train()

# Salva o LoRA adapters ao final do treinamento
model.save_pretrained(project_dir + "/lora_model") # Local saving
tokenizer.save_pretrained(project_dir + "/lora_model")

Continuando treinamento a partir de: /content/drive/MyDrive/Treinamento/TinyLlama/checkpoint-4536



==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 1,705 | Num Epochs = 25
O^O/ \_/ \    Batch size per device = 3 | Gradient Accumulation steps = 3
\        /    Total batch size = 9 | Total steps = 4,725
 "-____-"     Number of trainable parameters = 25,231,360


Step,Training Loss
4540,0.2736
4560,0.1756
4580,0.1414
4600,0.1402
4620,0.1302
4640,0.1359
4660,0.1331
4680,0.1327
4700,0.132
4720,0.1302


('/content/drive/MyDrive/Treinamento/TinyLlama/lora_model/tokenizer_config.json',
 '/content/drive/MyDrive/Treinamento/TinyLlama/lora_model/special_tokens_map.json',
 '/content/drive/MyDrive/Treinamento/TinyLlama/lora_model/tokenizer.model',
 '/content/drive/MyDrive/Treinamento/TinyLlama/lora_model/added_tokens.json',
 '/content/drive/MyDrive/Treinamento/TinyLlama/lora_model/tokenizer.json')

**Coisas a serem observadas**

1 teste \
apos eu ttreinar por 25 epocas, respondeu mais ou menos bem para a tarefa, tanto quando o modelo usado era o mesmo que foi treinado, como o qando carrega ele a partir do modelo salvo. \
Mas ele perdeu a capacidade de responder a outros tipos de tarefas.
Quando eu carrego o modelo a partir do modelo_base e mesclo ele com o modelo salvo, ele da uma resposta curta e fica repetindo todo o prompt em loop, porem ele não perdeu a capacidade de resolver outras tarefas.

--------------------------

2 teste \
Para o segundo teste, vou manter o uso do checkpoint, porem vou mudar a forma como uso os tokens s e /s, para ver ser isso mouda algo.


-------------------------

3 teste \
vou treinar o modelo por 3h e ver se isso muda algo. (suspeita é que usar os checkpoint esteja atrapalhando, mas deve ser outra coisa)

In [7]:
contexto = "Hotel Fasano, São Paulo. This luxury hotel, example located in the Jardins neighborhood, offers elegant accommodations with stunning city views. The hotel features a pool, spa and wellness center, gourmet restaurant, fitness center, and 24-hour reception."
pergunta = "Does Hotel Fasano have a pool available for guests?"

In [12]:
# Inferencia com o modelo apos o treinamento
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
inputs = tokenizer(
[
    ALPACA_PROMPT_DICT['prompt_context'].format(
        bos_token="",
        instruction="When prompted to 'respond', use the context to answer the question. If the hotel lacks a requested amenity, mention this and highlight other available amenities. Ensure the answer is detailed, informative and engaging, using the information in context to enhance your answer.",
        input=f"context: {contexto}\nrespond: {pergunta}",
        response="",
        eos_token=""
      )
], return_tensors = "pt").to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 250)

<s> Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction: # Instruction on how to perform the task
When prompted to 'respond', use the context to answer the question. If the hotel lacks a requested amenity, mention this and highlight other available amenities. Ensure the answer is detailed, informative and engaging, using the information in context to enhance your answer.

### Input: # Context and question
context: Hotel Fasano, São Paulo. This luxury hotel, example located in the Jardins neighborhood, offers elegant accommodations with stunning city views. The hotel features a pool, spa and wellness center, gourmet restaurant, fitness center, and 24-hour reception.
respond: Does Hotel Fasano have a pool available for guests?

### Response: # Model should generate the response here 
<s> Although Hotel Fasano does not have guests, it does offer This luxury hotel, loc

In [25]:
# Inferencia com o modelo carregando o modelo salvo
if True:
    from unsloth import FastLanguageModel
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = "/content/drive/MyDrive/Treinamento/TinyLlama/lora_model", # YOUR MODEL YOU USED FOR TRAINING
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
    )
    FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# alpaca_prompt = You MUST copy from above!

inputs = tokenizer(
[
    ALPACA_PROMPT_DICT['prompt_context'].format(
        bos_token="",
        instruction="When prompted to 'respond', use the context to answer the question. If the hotel lacks a requested amenity, mention this and highlight other available amenities. Ensure the answer is detailed, informative and engaging, using the information in context to enhance your answer.",
        input=f"context: {contexto}\nrespond: {pergunta}",
        response="",
        eos_token=""
      )
], return_tensors = "pt").to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 120)

==((====))==  Unsloth: Fast Llama patching release 2024.8
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.1+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.26.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
<s> Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction: # Instruction on how to perform the task
When prompted to 'respond', use the context to answer the question. If the hotel lacks a requested amenity, mention this and highlight other available amenities. Ensure the answer is detailed, informative and engaging, using the information in context to enhance your answer.

### Input: # Context and question
context: Hotel Fasano, São Paulo. This l

In [24]:
# Inferencia com o modelo a partir do modelo salvo
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
inputs = tokenizer(
[
    ALPACA_PROMPT_DICT['prompt_context'].format(
        bos_token="",
        instruction="Continue the fibonnaci sequence.",
        input=f"1, 1, 2, 3, 5, 8",
        response="",
        eos_token=""
      )
], return_tensors = "pt").to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128)

<s> Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction: # Instruction on how to perform the task
Continue the fibonnaci sequence.

### Input: # Context and question
1, 1, 2, 3, 5, 8

### Response: # Model should generate the response here 
I don't know the hotel you are looking for, please provide a hotel in Cabo Frio, Rio de Janeiro, Brazil.</s>


In [3]:
# Inferencia com o modelo, mesclando o modelo base com o modelo salvo
from peft import get_peft_model, PeftConfig

# Carregar o modelo base
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.


model, tokenizer = FastLanguageModel.from_pretrained(
  model_name = "unsloth/tinyllama-bnb-4bit",
  max_seq_length = max_seq_length,
  dtype = dtype,
  load_in_4bit = load_in_4bit,
)

# Carregar os adaptadores LoRA
lora_path = "/content/drive/MyDrive/Treinamento/TinyLlama/lora_model"
# Carregar a configuração dos adaptadores LoRA
peft_config = PeftConfig.from_pretrained(lora_path)

# Combinar o modelo base com os adaptadores LoRA
model = get_peft_model(model, peft_config)
FastLanguageModel.for_inference(model) # Enable native 2x faster inference


==((====))==  Unsloth: Fast Llama patching release 2024.8
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.1+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.26.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [8]:
inputs = tokenizer(
[
    ALPACA_PROMPT_DICT['prompt_context'].format(
        bos_token="",
        instruction="When prompted to 'respond', use the context to answer the question. If the hotel lacks a requested amenity, mention this and highlight other available amenities. Ensure the answer is detailed, informative and engaging, using the information in context to enhance your answer.",
        input=f"context: {contexto}\nrespond: {pergunta}",
        response="",
        eos_token=""
      )
], return_tensors = "pt").to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 300)

<s> Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction: # Instruction on how to perform the task
When prompted to 'respond', use the context to answer the question. If the hotel lacks a requested amenity, mention this and highlight other available amenities. Ensure the answer is detailed, informative and engaging, using the information in context to enhance your answer.

### Input: # Context and question
context: Hotel Fasano, São Paulo. This luxury hotel, example located in the Jardins neighborhood, offers elegant accommodations with stunning city views. The hotel features a pool, spa and wellness center, gourmet restaurant, fitness center, and 24-hour reception.
respond: Does Hotel Fasano have a pool available for guests?

### Response: # Model should generate the response here 
Yes, the hotel has a pool available for guests.

### Instruction: # Instruction on h

In [None]:
inputs = tokenizer(
[
    ALPACA_PROMPT_DICT['prompt_context'].format(
        bos_token="",
        instruction="Continue the fibonnaci sequence.",
        input=f"1, 1, 2, 3, 5, 8",
        response="",
        eos_token=""
      )
], return_tensors = "pt").to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 2050)

<s> Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction: # Instruction on how to perform the task
Continue the fibonnaci sequence.

### Input: # Context and question
1, 1, 2, 3, 5, 8

### Response: # Model should generate the response here 
1, 2, 3, 5, 8, 13, 21, 34, 55, 89, 144, 233, 377, 610, 987, 1597, 2584, 4181, 6765, 10946, 17763, 28672, 46368, 75025, 121665, 196418, 318511, 518496, 832040, 1310720, 2097152, 3374503, 5184960, 8320400, 13107200, 20971520, 33745030, 51849600, 83204000, 131072000, 209715200, 337450300, 518496000, 832040000, 1310720000, 2097152000, 3374503000, 5184960000, 8320400000, 13107200000, 20971520000, 33745030000, 51849600000, 83204000000, 131072000000, 209715200000, 337450300000, 518496000000, 832040000000, 1310720000000, 2097152000000, 3374503000000, 5184960000000, 8320400000000, 13107200000000, 20971520000000, 33745030000000, 518496000

This is a friendly reminder - the current text generation call will exceed the model's predefined maximum length (2048). Depending on the model, you may observe exceptions, performance degradation, or nothing at all.


**REFERENCIAS**

[1] [Colab original com tinyLlama](https://colab.research.google.com/drive/1AZghoNBQaMDgWJpi4RbffGM1h6raLUj9?usp=sharing#scrollTo=95_Nn-89DhsL)