In [1]:
%%capture
!pip install unsloth "xformers==0.0.28.post2"
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

In [2]:
%%capture
!pip install datasets
!pip install trl
!pip install transformers
!pip install evaluate

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [13]:
from unsloth import FastLanguageModel, is_bfloat16_supported
import torch
import json
from datasets import load_dataset, load_from_disk
from trl import SFTTrainer
from transformers import TrainingArguments, Trainer


max_seq_length = 2048
dtype = None
load_in_4bit = True
model_path = "/content/drive/MyDrive/Colab Notebooks/unsloth_model"

fourbit_models = [
     "unsloth/llama-3-8b-bnb-4bit"
]

torch.cuda.empty_cache()

In [14]:
model, tokenizer = FastLanguageModel.from_pretrained(model_path)

==((====))==  Unsloth 2024.12.4: Fast Llama patching. Transformers:4.46.3.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post2. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [15]:
def preprocess_function(examples):
    inputs = tokenizer(
        examples['text'],
        truncation=True,
        padding='max_length',
        max_length=max_seq_length
    )

    inputs['labels'] = inputs['input_ids'].copy()
    return inputs

In [17]:
VALIDATION_PATH_DATASET = "/content/drive/MyDrive/Colab Notebooks/dataset/validation_dataset"

validation_dataset = load_from_disk(VALIDATION_PATH_DATASET)

validation_dataset = validation_dataset.map(preprocess_function, batched=True)

In [18]:
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = validation_dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False,
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        max_steps = 1000,
        learning_rate = 2e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        remove_unused_columns=False
    ),
)

  trainer = Trainer(


In [19]:
import evaluate

def calculate_perplexity(trainer, eval_dataset):
    trainer.args.per_device_eval_batch_size = 1

    torch.cuda.empty_cache()

    eval_results = trainer.predict(eval_dataset)
    perplexity = torch.exp(torch.tensor(eval_results.metrics["eval_loss"]))
    return perplexity.item()

def calculate_bleu(trainer, eval_dataset):
    trainer.args.per_device_eval_batch_size = 1

    torch.cuda.empty_cache()

    bleu = evaluate.load("bleu")
    predictions = trainer.predict(eval_dataset).predictions
    decoded_preds = trainer.tokenizer.batch_decode(predictions.argmax(-1), skip_special_tokens=True)
    decoded_labels = eval_dataset["output"]
    results = bleu.compute(predictions=decoded_preds, references=[[label] for label in decoded_labels])
    return results["bleu"]

In [20]:
perplexity = calculate_perplexity(trainer, validation_dataset)
print(f"Perplexidade: {perplexity}")

bleu_score = calculate_bleu(trainer, validation_dataset)
print(f"BLEU: {bleu_score}")

ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`instruction` in this case) have excessive nesting (inputs type `list` where type `int` is expected).