### Instalação e importação das bibliotecas do Unsloth

In [1]:
%%capture
# Environment setup cell. `%%capture` silences the output of the whole cell.
# Installs Unsloth, Xformers (Flash Attention) and all other packages!
# NOTE(review): Unsloth is installed from the tip of its git repository, so
# re-running this cell at a later date may pull a different version.
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

# We have to check which Torch version for Xformers (2.3 -> 0.0.27)
# Torch older than 2.4.0 gets xformers pinned to 0.0.27; otherwise the bare
# package name is used and pip chooses the version.
from torch import __version__; from packaging.version import Version as V
xformers = "xformers==0.0.27" if V(__version__) < V("2.4.0") else "xformers"
# `--no-deps` installs only the listed packages, without their dependencies.
# `{xformers}` is interpolated from the Python variable defined above.
!pip install --no-deps {xformers} trl peft accelerate bitsandbytes triton

### Escolha do foundation model e configuração dos parâmetros

In [2]:
from unsloth import FastLanguageModel
import torch
# ---- Model loading settings ------------------------------------------------
# Maximum sequence length. Choose any value; RoPE scaling is handled internally.
max_seq_length = 2048
# None means auto-detection. Otherwise Float16 for Tesla T4 / V100,
# Bfloat16 for Ampere and newer.
dtype = None
# 4-bit quantization reduces memory usage. Can be set to False.
load_in_4bit = True

# Pre-quantized 4-bit checkpoints (faster download, no OOMs). This list is a
# reference only: nothing in this cell reads it.
# More models at https://huggingface.co/unsloth
fourbit_models = [
    # Llama 3.1 (15 trillion tokens)
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-405B-bnb-4bit",
    # Mistral Nemo 12b
    "unsloth/Mistral-Nemo-Base-2407-bnb-4bit",
    "unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit",
    # Mistral v3
    "unsloth/mistral-7b-v0.3-bnb-4bit",
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    # Phi
    "unsloth/Phi-3.5-mini-instruct",
    "unsloth/Phi-3-medium-4k-instruct",
    # Gemma 2
    "unsloth/gemma-2-9b-bnb-4bit",
    "unsloth/gemma-2-27b-bnb-4bit",
]

# Load the base model and its tokenizer with the settings above.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Meta-Llama-3.1-8B-bnb-4bit",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    # token="hf_...",  # only needed for gated models like meta-llama/Llama-2-7b-hf
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
==((====))==  Unsloth 2024.9: Fast Llama patching. Transformers = 4.44.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.0+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/230 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/345 [00:00<?, ?B/s]

We now add LoRA adapters so we only need to update 1 to 10% of all parameters!

In [3]:
# Projection layers that receive LoRA adapters.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

# Wrap the loaded model with LoRA adapters; `model` is rebound to the wrapped model.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,                                  # any number > 0; suggested 8, 16, 32, 64, 128
    target_modules=lora_target_modules,
    lora_alpha=16,
    lora_dropout=0,                        # any value is supported, 0 is the optimized path
    bias="none",                           # any value is supported, "none" is the optimized path
    # "unsloth" uses 30% less VRAM and fits 2x larger batch sizes;
    # use True or "unsloth" for very long context.
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=False,                      # rank stabilized LoRA is supported
    loftq_config=None,                     # LoftQ is supported as well
)

Unsloth 2024.9 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


<a name="Data"></a>
### Data Prep

Criação da coluna `instruction` no dataset

In [4]:
from datasets import load_dataset
# Load the "train" split of the rafaelpivetta/tech-challenge-fase3 dataset.
dataset = load_dataset("rafaelpivetta/tech-challenge-fase3", split = "train")


# Builds the instruction by prefixing the title with 'Describe the product '.
def add_instruction_column(example):
    """Attach an ``instruction`` prompt derived from the example's title.

    Args:
        example: Mapping for a single dataset row; must contain a ``title`` key.

    Returns:
        The same ``example`` object, mutated in place, with an added
        ``instruction`` entry of the form ``"Describe the product <title>"``.
    """
    product_title = example["title"]
    example["instruction"] = "Describe the product {}".format(product_title)
    return example

# Apply the function to every row of the dataset; `dataset` is rebound to the
# result, which carries the additional `instruction` column.
dataset = dataset.map(add_instruction_column)

trn_limpo.json:   0%|          | 0.00/17.4M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/16325 [00:00<?, ? examples/s]

Map:   0%|          | 0/16325 [00:00<?, ? examples/s]

In [5]:
# Show the dataset summary (column names and row count) to confirm that the
# `instruction` column was added.
print(dataset)

Dataset({
    features: ['uid', 'title', 'content', 'instruction'],
    num_rows: 16325
})


In [None]:
# Prompt template with two positional `{}` slots: the first receives the
# instruction, the second the response (passed as "" at inference time so the
# model completes it).
alpaca_prompt = """Below is an instruction to provide a description of a product. Write a simple response that best describe the product.

### Instruction:
{}

### Response:
{}"""

# End-of-sequence marker taken from the tokenizer; it is appended to every
# formatted training example.
EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
def formatting_prompts_func(examples):
    """Render a batch of rows into complete training prompts.

    Args:
        examples: Batched mapping of column name to list of values; the
            ``instruction`` and ``content`` columns are read.

    Returns:
        A dict with a single ``text`` key holding one formatted prompt per
        row, each terminated by ``EOS_TOKEN``.
    """
    pairs = zip(examples["instruction"], examples["content"])
    # EOS_TOKEN must be appended, otherwise generation will go on forever.
    return {
        "text": [
            alpaca_prompt.format(instruction, output) + EOS_TOKEN
            for instruction, output in pairs
        ],
    }

# Build the `text` column in batched mode: the function receives lists of
# column values and returns one formatted prompt per row.
dataset = dataset.map(formatting_prompts_func, batched = True,)


In [7]:
# Show the first entry of the `text` column that will be fed to SFTTrainer.
# Slicing the rows first (`dataset[:1]`) reads a single row, instead of
# materializing the whole `text` column only to keep its first element; the
# printed value is the same one-element list.
print(dataset[:1]['text'])

['Below is an instruction to provide a description of a product. Write a simple response that best describe the product.\n\n### Instruction:\nDescribe the product Girls Ballet Tutu Neon Pink\n\n### Response:\nHigh quality 3 layer ballet tutu. 12 inches in length<|end_of_text|>']


In [8]:
# Show the dataset summary again to confirm that the `text` column is present.
print(dataset)

Dataset({
    features: ['uid', 'title', 'content', 'instruction', 'text'],
    num_rows: 16325
})


### Inferência antes do treino

In [8]:
# Baseline generation before any training step, for comparison with the
# post-training output later in the notebook.
# Uses `alpaca_prompt`, `model` and `tokenizer` defined in earlier cells.
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
prompt = alpaca_prompt.format(
    "Describe the product Girls Ballet Tutu Neon Pink", # instruction
    "", # output - leave this blank for generation!
)
inputs = tokenizer([prompt], return_tensors = "pt").to("cuda")

# Sampling options (e.g. do_sample=True, temperature=0.7, top_p=0.9,
# repetition_penalty=1.1) are arguments of generate(), not of the prompt
# template, so they would go on the call below.
# The result is named `outputs_before` because this is the pre-training
# baseline; the post-training cell assigns its own `outputs_after`.
outputs_before = model.generate(**inputs, max_new_tokens = 128, use_cache = True)
tokenizer.batch_decode(outputs_before)

['<|begin_of_text|>Below is an instruction to provide a description of a product. Write a simple response that best describe the product.\n\n### Instruction:\nDescribe the product Girls Ballet Tutu Neon Pink\n\n### Response:\nGirls Ballet Tutu Neon Pink is a product that is made of high-quality materials and is designed to provide comfort and support for girls who participate in ballet. The tutu is made of a lightweight, stretchy fabric that is soft and comfortable against the skin. The tutu is also designed to provide support and stability, allowing girls to perform their best on the dance floor. The tutu is available in a variety of sizes and colors, making it easy to find the perfect fit for your little ballerina.<|end_of_text|>']

<a name="Train"></a>
### Treinamento do modelo

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# The baseline-inference cell above switched the model to inference mode with
# FastLanguageModel.for_inference(model). Switch it back explicitly so the
# trainer receives a model in training mode.
FastLanguageModel.for_training(model)

# Supervised fine-tuning on the pre-formatted `text` column.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        # Effective batch size = 2 per device * 4 accumulation steps = 8.
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        # num_train_epochs = 1, # Set this for 1 full training run.
        # 50 steps * batch of 8 = 400 examples, a small fraction of the dataset.
        max_steps = 50,
        learning_rate = 2e-4,
        # Exactly one of fp16 / bf16 is enabled, depending on hardware support.
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
    ),
)

In [10]:
# Run the fine-tuning loop; the value returned by train() is kept in
# `trainer_stats` for later inspection.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 16,325 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 50
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss
1,2.2091
2,2.7012
3,2.9206
4,2.3987
5,2.5716
6,2.4476
7,2.239
8,1.9897
9,2.1828
10,2.1327


<a name="Inference"></a>
### Inferência após o treino

In [15]:
# Same prompt as the pre-training check, now answered after fine-tuning.
# Uses `alpaca_prompt`, `model` and `tokenizer` defined in earlier cells.
FastLanguageModel.for_inference(model)  # enable native 2x faster inference

instruction = "Describe the product Girls Ballet Tutu Neon Pink"
# The response slot is left empty so that the model generates it.
prompt = alpaca_prompt.format(instruction, "")
inputs = tokenizer([prompt], return_tensors="pt").to("cuda")

outputs_after = model.generate(**inputs, max_new_tokens=128, use_cache=True)
tokenizer.batch_decode(outputs_after)

['<|begin_of_text|>Below is an instruction to provide a description of a product. Write a simple response that best describe the product.\n\n### Instruction:\nDescribe the product Girls Ballet Tutu Neon Pink\n\n### Response:\nThe tutu is a traditional ballet costume for girls. This book is part of the Tutu series, which also includes tutus in blue, pink, red, purple, and green. Each book features a tutu in a different color and provides a simple description of the garment. The books are illustrated with photographs of girls wearing the tutus. The series is appropriate for young children, but the books would be more useful if they provided more information about ballet and the tutu. For example, the text could provide a simple description of the garment, or explain the origins of the tutu. The books could also provide information about the']

Inferência utilizando `TextStreamer`

In [16]:
from transformers import TextStreamer

# Streaming variant of the post-training check.
# Uses `alpaca_prompt`, `model` and `tokenizer` defined in earlier cells.
FastLanguageModel.for_inference(model)  # enable native 2x faster inference

instruction = "Describe the product Girls Ballet Tutu Neon Pink"
# The response slot is left empty so that the model generates it.
prompt = alpaca_prompt.format(instruction, "")
inputs = tokenizer([prompt], return_tensors="pt").to("cuda")

# The streamer emits the text while it is being generated, so the value
# returned by generate() is discarded.
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer=text_streamer, max_new_tokens=128)

<|begin_of_text|>Below is an instruction to provide a description of a product. Write a simple response that best describe the product.

### Instruction:
Describe the product Girls Ballet Tutu Neon Pink

### Response:
The girls tutu is a classic ballet costume. It is a full-length, flared skirt that is worn over a pair of tights and a leotard. This particular tutu is made from a stretchy, shiny fabric that is ideal for dance, and the tutu is decorated with a neon pink applique that will help your little ballerina stand out in the crowd.<|end_of_text|>
