<a href="https://colab.research.google.com/github/MarcosVeniciu/HotelQA-RAG/blob/main/Fine_tuning_code/FineTuningTinyLlama.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
# Colab setup cell: installs Unsloth from the main branch of its GitHub repository,
# then the companion packages (xformers, trl, peft, accelerate, bitsandbytes).
# `--no-deps` tells pip not to pull in the dependencies of those companion packages.
# `%%capture` hides the installer output of this cell.
# NOTE(review): the Unsloth install is not pinned to a tag/commit, so re-running
# this notebook later may pick up a different version.
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes

In [None]:
from transformers.trainer_utils import get_last_checkpoint
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported
from unsloth import FastLanguageModel
from datasets import load_dataset
from trl import SFTTrainer
import torch
import os

In [3]:
project_path = "TinyLlama" # Name of the project folder (checkpoints and the final model are written under it)
salvar_GDrive = False # If True, Google Drive is mounted so the checkpoints are saved there

In [4]:
if salvar_GDrive:
  # Mount Google Drive at /content/drive (Colab only). A later cell checks
  # whether this path exists to decide where the training output is written.
  from google.colab import drive
  drive.mount('/content/drive')

In [None]:
# Loading options. They are kept as module-level names because later cells
# (trainer construction, reloading the saved model) reuse them.
max_seq_length = 2048  # Choose any! RoPE scaling is supported internally by Unsloth
dtype = None           # None = auto detection. Float16 for Tesla T4, V100; Bfloat16 for Ampere+
load_in_4bit = False   # 4bit quantization reduces memory usage. Can be False.

load_kwargs = dict(
  model_name = "unsloth/tinyllama-bnb-4bit",
  max_seq_length = max_seq_length,
  dtype = dtype,
  load_in_4bit = load_in_4bit,
)
model, tokenizer = FastLanguageModel.from_pretrained(**load_kwargs)

# Report the size of the base model before any adapter is attached.
total_params = model.num_parameters()
print(f"\nTotal de parâmetros: {total_params:,}")

In [None]:
# Modules that receive LoRA adapters.
lora_target_modules = [
  "q_proj", "k_proj", "v_proj", "o_proj",
  "gate_proj", "up_proj", "down_proj",
]

lora_settings = dict(
  r = 64,                             # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
  target_modules = lora_target_modules,
  lora_alpha = 32,
  lora_dropout = 0,                   # Currently only supports dropout = 0
  bias = "none",                      # Currently only supports bias = "none"
  use_gradient_checkpointing = False, # @@@ IF YOU GET OUT OF MEMORY - set to True @@@
  random_state = 3407,
  use_rslora = False,                 # Rank stabilized LoRA is supported
  loftq_config = None,                # And LoftQ
)

# NOTE: this rebinds `model`, so re-running the cell wraps the already wrapped model.
model = FastLanguageModel.get_peft_model(model, **lora_settings)

lora_param_count = model.num_parameters()
print(f"\nTotal de parâmetros com LoRa: {lora_param_count:,}")

In [7]:
# Alpaca-style TRAINING template. Placeholders: bos_token, instruction, input,
# response, eos_token. The answer is wrapped in <res>...</res>.
# NOTE: the three preamble sentences are joined without a space between them;
# that is kept as-is because the inference template uses the same text.
ALPACA_PROMPT_DICT = dict(
  prompt_context="".join((
    # Preamble
    "{bos_token}Below is an instruction that describes a task, "
    "paired with an input that provides further context."
    "Only respond with exact facts from Input."
    "Write a response between <res> and </res>.\n\n",
    # Sections
    "### Instruction:\n{instruction}\n\n",
    "### Input:\n{input}\n\n",
    "### Response:\n<res>{response}</res>{eos_token}",
  ))
)

# Special tokens read from the loaded tokenizer; formatPrompt writes them
# explicitly at the start and end of every training example.
# NOTE(review): the inference outputs start with `<s>` although the inference
# template has no BOS placeholder, so the tokenizer seems to add BOS by itself.
# Training text may therefore end up with a duplicated BOS - confirm.
BOS_TOKEN = tokenizer.bos_token
EOS_TOKEN = tokenizer.eos_token
def formatPrompt(example):
  """Render one dataset row as a single training string.

  Args:
    example: mapping that provides the 'Question', 'Context' and 'Answer' fields.

  Returns:
    A dict with one key, "text", holding the training template filled with
    BOS_TOKEN, the question (prefixed with "respond: "), the context, the
    answer and EOS_TOKEN.
  """
  prompt = ALPACA_PROMPT_DICT['prompt_context'].format(
    bos_token=BOS_TOKEN,
    instruction=f"respond: {example['Question']}",
    input=example['Context'],  # a stray quote here used to make the cell a SyntaxError
    response=example['Answer'],
    eos_token=EOS_TOKEN
  )

  return {"text": prompt}


# downlad dataset
if not os.path.exists('Dataset_V3_train_9.5k.parquet'):
  !wget https://github.com/MarcosVeniciu/HotelQA-RAG/raw/main/Dataset/9,5K/Dataset_V3_train_9.5k.parquet
  !wget https://github.com/MarcosVeniciu/HotelQA-RAG/raw/main/Dataset/9,5K/Dataset_V3_test_9.5k.parquet

dataset = load_dataset('parquet', data_files={'train': 'Dataset_V3_train_9.5k.parquet',
                                              'test': 'Dataset_V3_test_9.5k.parquet'})
# Aplicando a função formatPrompt ao dataset
dataset = dataset.map(formatPrompt, remove_columns=['Context', 'Question', 'Answer'])

In [None]:
# Training output goes to Google Drive when /content/drive exists,
# otherwise to a folder named after the project in the working directory.
drive_available = os.path.exists("/content/drive")
project_dir = (
  os.path.join("/content/drive/MyDrive/Treinamento", project_path)
  if drive_available
  else project_path
)
print(f"Project diretory: {project_dir}")

num_epocas = 8 # number of training epochs

# Supervised fine-tuning trainer over the formatted "text" column.
trainer = SFTTrainer(
  model = model,
  tokenizer = tokenizer,
  # Seeded shuffle: keeps the data order identical across runs, which matters
  # when training is resumed from a checkpoint in a new session.
  train_dataset = dataset['train'].shuffle(seed = 3407),
  dataset_text_field = "text",
  max_seq_length = max_seq_length,
  dataset_num_proc = 2,
  packing = True, # Packs short sequences together to save time!
  args = TrainingArguments(
    num_train_epochs = num_epocas,
    per_device_train_batch_size = 4,
    gradient_accumulation_steps = 4, # 4 x 4 = 16 sequences per optimizer step per device
    save_total_limit=2,              # at most 2 checkpoints are kept on disk
    save_strategy="steps",
    save_steps=50,                   # a checkpoint is written every 50 steps
    warmup_ratio = 0.4,
    learning_rate = 2e-5,
    fp16 = not is_bfloat16_supported(), # fall back to fp16 when bf16 is unavailable
    bf16 = is_bfloat16_supported(),
    logging_steps = 10,
    optim = "adamw_8bit",
    weight_decay = 0.1,
    lr_scheduler_type = "cosine",
    seed = 3407,
    output_dir=project_dir,
  ),
)

In [9]:
# If the project folder already holds a saved checkpoint, training continues from the latest one.
# get_last_checkpoint lists the folder contents, so it is only called when the folder exists.
last_checkpoint = get_last_checkpoint(project_dir) if os.path.isdir(project_dir) else None
if last_checkpoint is not None: # resume from the latest saved checkpoint
  print(f"Continuando treinamento a partir de: {last_checkpoint}\n")
  trainer_stats = trainer.train(resume_from_checkpoint=last_checkpoint)
else: # start a new training run
  print("Começando um novo treinamento:")
  trainer_stats = trainer.train()

# Save the final model inside the project folder.
trainer.save_model(os.path.join(project_dir, "tinyllama_saved"))

Step,Training Loss
10,1.357
20,1.3547
30,1.3086
40,1.2542
50,1.1892
60,1.1389
70,1.0698
80,0.9978
90,0.9253
100,0.8432


In [8]:
# Hand-written sample used by the inference cells below:
# `contexto` is the hotel description, `pergunta` asks about an amenity the
# description mentions (pool), `pergunta2` about one it does not mention (free wi-fi).
contexto = "Hotel Fasano, São Paulo. This luxury hotel, example located in the Jardins neighborhood, offers elegant accommodations with stunning city views. The hotel features a pool, spa and wellness center, gourmet restaurant, fitness center, and 24-hour reception."
pergunta = "Does Hotel Fasano have a pool available for guests?"
pergunta2 = "Does Hotel Fasano have a free wi-fi available for guests?"

# Alpaca-style INFERENCE template: same preamble and sections as the training
# template, but without BOS/EOS placeholders, and it stops right after the
# opening <res> tag so the model writes the answer.
# NOTE: this rebinds ALPACA_PROMPT_DICT and therefore replaces the training template.
ALPACA_PROMPT_DICT = dict(
  prompt_context="".join((
    # Preamble
    "Below is an instruction that describes a task, "
    "paired with an input that provides further context."
    "Only respond with exact facts from Input."
    "Write a response between <res> and </res>.\n\n",
    # Sections
    "### Instruction:\n{instruction}\n\n",
    "### Input:\n{input}\n\n",
    "### Response:\n<res>",
  ))
)


In [11]:
# Inference with the model right after training (question about the pool).
# NOTE(review): the input here is the bare context, while the other inference
# cells prefix it with "context: " - confirm which form matches the training data.
from transformers import TextStreamer

FastLanguageModel.for_inference(model) # Enable native 2x faster inference

prompt = ALPACA_PROMPT_DICT['prompt_context'].format(
  instruction=f"respond: {pergunta}",
  input=contexto,
)
inputs = tokenizer([prompt], return_tensors = "pt").to(model.device)

# Stream the generated tokens to the cell output as they are produced.
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 250)

<s> Below is an instruction that describes a task, paired with an input that provides further context.Write a response that appropriately completes the request.Write a response between <res> and </res>.

### Instruction:
respond: Does Hotel Fasano have a pool available for guests?

### Input:
context: Hotel Fasano, São Paulo. This luxury hotel, example located in the Jardins neighborhood, offers elegant accommodations with stunning city views. The hotel features a pool, spa and wellness center, gourmet restaurant, fitness center, and 24-hour reception.

### Response:
<res>Yes, Hotel Fasano has a pool available for guests, providing an opportunity to relax and enjoy the beautiful surroundings.</res></s>


In [12]:
# Inference with the model right after training (question about free wi-fi,
# which the context does not mention).
from transformers import TextStreamer

FastLanguageModel.for_inference(model) # Enable native 2x faster inference

prompt = ALPACA_PROMPT_DICT['prompt_context'].format(
  instruction=f"respond: {pergunta2}",
  input=f"context: {contexto}",
)
inputs = tokenizer([prompt], return_tensors = "pt").to(model.device)

# Stream the generated tokens to the cell output as they are produced.
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 250)

<s> Below is an instruction that describes a task, paired with an input that provides further context.Write a response that appropriately completes the request.Write a response between <res> and </res>.

### Instruction:
respond: Does Hotel Fasano have a free wi-fi available for guests?

### Input:
context: Hotel Fasano, São Paulo. This luxury hotel, example located in the Jardins neighborhood, offers elegant accommodations with stunning city views. The hotel features a pool, spa and wellness center, gourmet restaurant, fitness center, and 24-hour reception.

### Response:
<res>Yes, Hotel Fasano offers This luxury hotel, example located in the Jardins neighborhood, offers elegant accommodations with stunning city views. This luxury hotel, example located in the Jardins neighborhood, offers elegant accommodations with stunning city views ensuring that guests can enjoy a comfortable and convenient stay.</res></s>


In [28]:
# Inference with the model right after training, using an instruction that is
# unrelated to the hotel data.
from transformers import TextStreamer

FastLanguageModel.for_inference(model) # Enable native 2x faster inference

prompt = ALPACA_PROMPT_DICT['prompt_context'].format(
  instruction="Continue the fibonnaci sequence.",
  input="1, 1, 2, 3, 5, 8",
)
inputs = tokenizer([prompt], return_tensors = "pt").to(model.device)

# Stream the generated tokens to the cell output as they are produced.
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 250)

<s> Below is an instruction that describes a task, paired with an input that provides further context.Write a response that appropriately completes the request.Write a response between <res> and </res>.

### Instruction:
Continue the fibonnaci sequence.

### Input:
1, 1, 2, 3, 5, 8

### Response:
<res>Although the sequence is not complete, you can continue the sequence by adding 4 to the sequence. This will result in a total of 15, which can be found below.</res></s>


Carregando o modelo a partir do modelo salvo

In [15]:
# Inference with the model reloaded from the saved directory
from unsloth import FastLanguageModel
from transformers import TextStreamer

# Load from the same location the training cell saved to, so this also works
# when the project folder lives on Google Drive instead of /content.
saved_model_dir = os.path.abspath(os.path.join(project_dir, "tinyllama_saved"))
model, tokenizer = FastLanguageModel.from_pretrained(
  model_name = saved_model_dir,
  max_seq_length = max_seq_length,
  dtype = dtype,
  load_in_4bit = load_in_4bit,
)
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
print(f"\nTotal de parâmetros com LoRa: {model.num_parameters():,}\n\n")

prompt = ALPACA_PROMPT_DICT['prompt_context'].format(
  instruction=f"respond: {pergunta}",
  input=f"context: {contexto}",
)
# Place the inputs on the model's own device, as the other inference cells do.
inputs = tokenizer([prompt], return_tensors = "pt").to(model.device)

# Stream the generated tokens to the cell output as they are produced.
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 120)

==((====))==  Unsloth 2024.8: Fast Llama patching. Transformers = 4.43.3.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.1+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.26.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!

Total de parâmetros com LoRa: 1,150,511,104
<s> Below is an instruction that describes a task, paired with an input that provides further context.Only respond with exact facts from context.Write a response between <res> and </res>.

### Instruction:
respond: Does Hotel Fasano have a pool available for guests?

### Input:
context: Hotel Fasano, São Paulo. This luxury hotel, example located in the Jardins neighborhood, offers elegant accommodations with stunning city views. The hotel features a pool, spa and wellness center, gourmet restaurant, fitnes

In [14]:
# Inference with the model reloaded from the saved directory, using an
# instruction that is unrelated to the hotel data.
from transformers import TextStreamer

prompt = ALPACA_PROMPT_DICT['prompt_context'].format(
  instruction="Continue the fibonnaci sequence.",
  input="1, 1, 2, 3, 5, 8",
)
# Place the inputs on the model's own device, as the other inference cells do.
inputs = tokenizer([prompt], return_tensors = "pt").to(model.device)

# Stream the generated tokens to the cell output as they are produced.
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128)

<s> Below is an instruction that describes a task, paired with an input that provides further context.Only respond with exact facts from context.Write a response between <res> and </res>.

### Instruction:
Continue the fibonnaci sequence.

### Input:
1, 1, 2, 3, 5, 8

### Response:
<res>Although the sequence is not necessarily a fibonnaci sequence, it is a good example of a sequence that can be used to calculate the sum of the first n natural numbers. The sequence starts with 1, which is the first number in the sequence, and continues with each subsequent number, which is the sum of the previous two numbers.</res></s>


**REFERÊNCIAS**

[1] [Colab original com TinyLlama](https://colab.research.google.com/drive/1AZghoNBQaMDgWJpi4RbffGM1h6raLUj9?usp=sharing#scrollTo=95_Nn-89DhsL)