<a href="https://colab.research.google.com/github/MarcosVeniciu/HotelQA-RAG/blob/main/Fine_tuning_code/Fine_Tuning_Gemma_2_9B.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
# Installs Unsloth, Xformers (Flash Attention) and all other packages!
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes

In [1]:
from transformers.trainer_utils import get_last_checkpoint
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported
from unsloth import FastLanguageModel
from datasets import load_dataset
from trl import SFTTrainer
import torch
import os

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


In [2]:
project_path = "Gemma_2" # Nome da pasta do projeto
salvar_GDrive = False # Se True, salva os checkpoints no Google Drive

In [None]:
if salvar_GDrive:
  # Mount Google Drive
  from google.colab import drive
  drive.mount('/content/drive')

In [3]:
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

model, tokenizer = FastLanguageModel.from_pretrained(
  model_name = "unsloth/gemma-2-9b-it-bnb-4bit",
  max_seq_length = max_seq_length,
  dtype = dtype,
  load_in_4bit = load_in_4bit,
)

==((====))==  Unsloth: Fast Gemma2 patching release 2024.8
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.1+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.26.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [4]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2024.8 patched 42 layers with 42 QKV layers, 42 O layers and 42 MLP layers.


In [10]:
# Adicionar tokens especiais
special_tokens_dict = {'additional_special_tokens': ['<context>', '<end_of_context>', '<question>', '<end_of_question>', '<res>', '<end_of_res>', '<system>', '<end_of_system>']}
tokenizer.add_special_tokens(special_tokens_dict)

# Verificar se os tokens foram adicionados
print("Tokens Especiais Adicionados:", tokenizer.additional_special_tokens)

# Expandir a matriz de embeddings para incluir os novos tokens
model.to('cpu')
model.resize_token_embeddings(len(tokenizer))
model.to('cuda')


# Função para formatar o prompt no estilo do chat template
def formatPrompt(example):
    instrucao = "You are an assistant who answers questions based on the given context. Generate an answer to a question contained in <question> based on the context in <context>. Enclose your answer between <res> and </res>."
    prompt = (

        f"<bos><start_of_turn>user\n"
        f"<system>{instrucao}<\system>\n<context>{example['Context']}</context>\n<question>{example['Question']}</question><end_of_turn>\n"
        f"<start_of_turn>model\n"
        f"<res>{example['Answer']}</res><end_of_turn><eos>"
    )
    return {"text": prompt}


# downlad dataset
if not os.path.exists('Dataset_V2_train_16k.parquet'):
  !wget https://github.com/MarcosVeniciu/HotelQA-RAG/raw/main/Dataset/16k/Dataset_V2_train_16k.parquet
  !wget https://github.com/MarcosVeniciu/HotelQA-RAG/raw/main/Dataset/16k/Dataset_V2_test_16k.parquet

dataset = load_dataset('parquet', data_files={'train': 'Dataset_V2_train_16k.parquet',
                                              'test': 'Dataset_V2_test_16k.parquet'})
# Aplicando a função formatPrompt ao dataset
dataset = dataset.map(formatPrompt, remove_columns=['Context', 'Question', 'Answer'])

Tokens Especiais Adicionados: ['<context>', '<end_of_context>', '<question>', '<end_of_question>', '<res>', '<end_of_res>', '<system>', '<end_of_system>']


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [6]:
if os.path.exists("/content/drive"):
  project_dir = os.path.join("/content/drive/MyDrive/Treinamento", project_path)
else:
  project_dir = project_path
print(f"Project diretory: {project_dir}")

num_epocas = 1

trainer = SFTTrainer(
  model = model,
  tokenizer = tokenizer,
  train_dataset = dataset['train'].shuffle(),
  dataset_text_field = "text",
  max_seq_length = max_seq_length,
  dataset_num_proc = 2,
  packing = False, # Packs short sequences together to save time!
  args = TrainingArguments(
    num_train_epochs = num_epocas,
    per_device_train_batch_size = 1,
    gradient_accumulation_steps = 16,
    save_total_limit=2,
    save_strategy="steps",
    save_steps=6,
    warmup_ratio = 0.4,
    learning_rate = 2e-5,
    fp16 = not is_bfloat16_supported(),
    bf16 = is_bfloat16_supported(),
    logging_steps = 1,
    optim = "adamw_8bit",
    weight_decay = 0.1,
    lr_scheduler_type = "cosine",
    seed = 3407,
    output_dir=project_dir,
  ),
)

Project diretory: Gemma_2


Map (num_proc=2):   0%|          | 0/16557 [00:00<?, ? examples/s]

In [7]:
#se tiver algum checkpoint salvo na pasta do projeto, ele vai continuar a partir do ultimo salvo.
last_checkpoint = get_last_checkpoint(project_dir)
if last_checkpoint != None: # Continua a partir do ultimo checkpoint salvo
  print(f"Continuando treinamento a partir de: {last_checkpoint}\n")
  trainer_stats = trainer.train(resume_from_checkpoint=last_checkpoint)
else: # começa um novo treinamento
  print("Começando um novo treinamento:")
  trainer_stats = trainer.train()

trainer.save_model(os.path.join(project_dir, "gemma_ft_saved"))

Começando um novo treinamento:


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 16,557 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 16
\        /    Total batch size = 16 | Total steps = 1,034
 "-____-"     Number of trainable parameters = 54,018,048


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
instrucao = "You are an assistant who answers questions based on the given context. Generate an answer to a question contained in <question> based on the context in <context>. Enclose your answer between <res> and </res>."

contexto = "Hotel Fasano, São Paulo. This luxury hotel, example located in the Jardins neighborhood, offers elegant accommodations with stunning city views. The hotel features a pool, spa and wellness center, gourmet restaurant, fitness center, and 24-hour reception."
pergunta = "Does Hotel Fasano have a pool available for guests?"
pergunta2 = "Does Hotel Fasano have a free wi-fi available for guests?"

In [None]:
# Inferencia com o modelo apos o treinamento
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

chat = [
    { "role": "user", "content": f"<system>{instrucao}<\system>\n<context>{contexto}</context>\n<question>{pergunta}</question>" },
    { "role": "model", "content": f"<res>isso é uma ersposta</res>"}
]

inputs = tokenizer([tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)], return_tensors = "pt").to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 250)

**REFERENCIAS**

[1] [Colab original com Gemma 2 9B](https://colab.research.google.com/drive/1vIrqH5uYDQwsJ4-OO3DErvuv4pBgVwk4?usp=sharing#scrollTo=QmUBVEnvCDJv)