<a href="https://colab.research.google.com/github/renato-penna/fiap-tech-challenge-fase03/blob/main/fine_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive so files stored in the user's Drive
# can be read and written through the regular filesystem.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Install Unsloth straight from its GitHub repository with the "colab-new" extra.
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
# Install the training stack with --no-deps (pip does not resolve their own
# dependencies); trl is constrained to versions below 0.9.0.
!pip install --no-deps xformers "trl<0.9.0" peft accelerate bitsandbytes
# Install transformers and datasets (versions are not pinned).
!pip install transformers datasets

In [None]:
from unsloth import FastLanguageModel, is_bfloat16_supported
import torch
import json
from datasets import load_dataset
from trl import SFTTrainer
from transformers import TrainingArguments

# Source JSON file on the mounted Drive; passed to load_dataset as data_files.
DATA_PATH = "/content/drive/MyDrive/Fiap/trnTreaded.json"
# Destination of the reformatted instruction/input/output JSON file.
OUTPUT_PATH_DATASET = "/content/drive/MyDrive/Fiap/formatted_trn.json"
# The three settings below are forwarded to FastLanguageModel.from_pretrained.
max_seq_length = 2048
# NOTE(review): None presumably lets Unsloth pick the dtype itself - confirm.
dtype = None
load_in_4bit = True
# Reference list of Unsloth model identifiers. The loading cell hard-codes its
# own model name rather than reading from this list.
fourbit_models = [
    "unsloth/mistral-7b-v0.3-bnb-4bit",
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/llama-3-8b-bnb-4bit",
    "unsloth/llama-3-8b-Instruct-bnb-4bit",
    "unsloth/llama-3-70b-bnb-4bit",
    "unsloth/Phi-3-mini-4k-instruct",
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/mistral-7b-bnb-4bit",
    "unsloth/gemma-7b-bnb-4bit",
]


In [None]:
def format_dataset_into_model_input(data):
    """Turn one raw record into an (instruction, input, output) triple.

    Args:
        data: Mapping with optional ``"prompt"`` and ``"completion"`` entries.
            A missing key or a ``None`` value is treated as an empty string.

    Returns:
        A tuple ``(instruction, input_text, response)`` where:

        * ``instruction`` is always the same fixed sentence;
        * ``input_text`` is the text following the first ``"Question:"``
          marker in the prompt, cut at the next ``"Question:"`` or
          ``"\\nAnswer:"`` marker and stripped of surrounding whitespace
          (empty when the prompt has no ``"Question:"`` marker);
        * ``response`` is the completion stripped of surrounding whitespace.
    """
    # `.get(key, "")` only covers a missing key; `or ""` also covers a key
    # that is present with a null value, which would otherwise fail on
    # `.split` / `.strip` with an uncaught AttributeError.
    prompt = data.get("prompt") or ""
    completion = data.get("completion") or ""

    instruction = "Generate a description for the following item."

    # Index 1 exists only when the prompt contains at least one "Question:".
    segments = prompt.split("Question:")
    if len(segments) > 1:
        input_text = segments[1].split("\nAnswer:")[0].strip()
    else:
        input_text = ""

    return instruction, input_text, completion.strip()

# Column-wise accumulators for the reformatted dataset.
instructions = []
inputs = []
outputs = []

# Load the dataset
dataset = load_dataset('json', data_files=DATA_PATH)
train_split = dataset['train']

prompts = train_split['input']

# Previously only the prompt was forwarded to the formatter, so every "output"
# written below was an empty string. Pair each prompt with its completion.
# NOTE(review): the completion column name is assumed to be "output" (or
# "completion") - confirm against the source file. When neither column exists
# the outputs stay empty, exactly as before.
completion_column = next(
    (name for name in ("output", "completion") if name in train_split.column_names),
    None,
)
if completion_column is None:
    completions = [""] * len(prompts)
else:
    completions = train_split[completion_column]

# Reformat every record into its instruction / input / output parts.
for prompt, completion in zip(prompts, completions):
    instruction, input_text, response = format_dataset_into_model_input(
        # `or ""` keeps a null completion from reaching `.strip()`.
        {"prompt": prompt, "completion": completion or ""}
    )
    instructions.append(instruction)
    inputs.append(input_text)
    outputs.append(response)

# Assemble the final column-oriented dictionary.
formatted_data = {
    "instruction": instructions,
    "input": inputs,
    "output": outputs
}

# Write the result to a JSON file.
with open(OUTPUT_PATH_DATASET, 'w') as output_file:
    json.dump(formatted_data, output_file, indent=4)

print(f"Dataset salvo em {OUTPUT_PATH_DATASET}")

In [None]:
# Load the "unsloth/llama-3-8b-bnb-4bit" checkpoint and its tokenizer using the
# sequence length, dtype and 4-bit settings from the configuration cell.
# (A stray `format_dataset_into_model_input(data)` call used to sit here; no
# `data` variable is defined at notebook level, so it raised NameError on a
# fresh kernel, and its return value was discarded anyway.)
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/llama-3-8b-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

In [None]:
# Wrap the loaded model with LoRA adapters via Unsloth's PEFT helper.
# NOTE(review): this rebinds `model`, so re-running the cell feeds the already
# wrapped model back in - re-run the loading cell first.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=False,
    loftq_config=None,
)