In [None]:
pip install -r requirements.txt

In [None]:
import argparse
import gc
import json
import os
from pathlib import Path

import torch
from datasets import Dataset, load_dataset as hf_load
from google.colab import drive
from huggingface_hub import login
from peft import (
    LoraConfig,
    TaskType,
    get_peft_model,
    prepare_model_for_kbit_training,
)
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)
from trl import SFTTrainer
# Mount Google Drive so the training data and the output directory persist
# beyond the lifetime of the Colab runtime.
drive.mount("/content/drive")

# Authenticate against the Hugging Face Hub without embedding the token in the
# notebook: use the HF_TOKEN environment variable when it is set; otherwise
# `token` is None and `login` falls back to its interactive prompt.
login(token=os.environ.get("HF_TOKEN"))

# Paths on Drive
JSONL_PATH = "/content/drive/MyDrive/train_data.jsonl"
OUT_DIR    = "/content/drive/MyDrive/llama_finetuned"

In [None]:
def load_dataset(train_data_path: str) -> Dataset:
    """Load a JSON/JSONL file as a single training ``Dataset``.

    Args:
        train_data_path: Path of the JSON/JSONL file to read.

    Returns:
        The ``train`` split produced by the Hugging Face ``json`` loader
        (imported in this notebook as ``hf_load``).
    """
    loader_kwargs = {"data_files": train_data_path, "split": "train"}
    return hf_load("json", **loader_kwargs)

In [None]:
def tokenize_dataset(dataset: Dataset, tokenizer, max_length: int = 512) -> Dataset:
    """Tokenize the ``text`` column and attach causal-LM labels.

    Every example is truncated and padded to exactly ``max_length`` tokens.
    Labels mirror ``input_ids`` except at padding positions, which are set to
    ``-100`` so the loss ignores them (otherwise the model would be trained to
    predict the padding token).

    Args:
        dataset: Dataset exposing a ``text`` column.
        tokenizer: Callable tokenizer returning a mapping with ``input_ids``
            and, normally, ``attention_mask``.
        max_length: Fixed sequence length after truncation/padding.

    Returns:
        The mapped dataset with the ``text`` column removed and the tokenizer
        outputs plus ``labels`` added.
    """
    def tokenize_fn(examples):
        out = tokenizer(
            examples["text"],
            truncation=True,
            max_length=max_length,
            padding="max_length",
        )
        masks = out.get("attention_mask")
        if masks is None:
            # No mask available: fall back to an unmasked copy of the ids.
            out["labels"] = [list(ids) for ids in out["input_ids"]]
        else:
            # Use the attention mask (not the pad token id) to find padding,
            # so genuine tokens that share the pad id stay supervised.
            out["labels"] = [
                [tok if keep else -100 for tok, keep in zip(ids, mask)]
                for ids, mask in zip(out["input_ids"], masks)
            ]
        return out

    return dataset.map(tokenize_fn, batched=True, remove_columns=["text"])

In [None]:
def create_model_and_tokenizer(
    model_name: str = "meta-llama/Llama-3.1-8B-Instruct",
    max_length: int = 512,
):
    """Load a 4-bit quantized causal LM, wrap it with LoRA and return it with its tokenizer.

    Args:
        model_name: Hub id (or local path) of the base model.
        max_length: Value assigned to ``tokenizer.model_max_length``.

    Returns:
        ``(model, tokenizer)`` where ``model`` is the PEFT-wrapped model with
        LoRA adapters on the attention projections, and ``tokenizer`` uses its
        EOS token as the padding token.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
    # Reuse EOS as the padding token so fixed-length padding is possible.
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.model_max_length = max_length

    # Compute in bf16 only when the GPU supports it; otherwise use fp16.
    # Check CUDA availability first so the capability probe is never reached
    # on a machine without a GPU.
    bf16_ok = torch.cuda.is_available() and torch.cuda.is_bf16_supported()
    compute_dtype = torch.bfloat16 if bf16_ok else torch.float16

    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=compute_dtype,
    )

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        device_map="auto",
    )
    # The KV cache is only useful for generation and conflicts with the
    # gradient checkpointing used during training.
    model.config.use_cache = False
    model = prepare_model_for_kbit_training(model)

    lora_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        r=16,
        lora_alpha=32,
        lora_dropout=0.05,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
        bias="none",
    )
    model = get_peft_model(model, lora_config)
    return model, tokenizer

In [None]:
def run_train(train_data_path: str, output_dir: str, num_epochs: int, max_steps: int = -1):
    """Fine-tune the LoRA model on a JSONL corpus and save the adapter and tokenizer.

    Args:
        train_data_path: Path to the JSONL file; each record needs a ``text`` field.
        output_dir: Directory for checkpoints, the final adapter and the tokenizer.
        num_epochs: Number of passes over the training data.
        max_steps: Optional cap on optimizer steps. The default ``-1`` leaves
            ``num_epochs`` in control; a positive value takes precedence over
            ``num_epochs`` (``TrainingArguments`` semantics).
    """
    gc.collect()
    torch.cuda.empty_cache()

    dataset = load_dataset(train_data_path)
    model, tokenizer = create_model_and_tokenizer()
    tokenized = tokenize_dataset(dataset, tokenizer, max_length=512)

    # `remove_unused_columns` is disabled below, so drop any extra JSONL
    # fields here; otherwise they would reach the collator.
    model_inputs = {"input_ids", "attention_mask", "labels"}
    extra_columns = [c for c in tokenized.column_names if c not in model_inputs]
    if extra_columns:
        tokenized = tokenized.remove_columns(extra_columns)

    # Enable exactly one mixed-precision mode: bf16 when the GPU supports it,
    # fp16 on other GPUs, and neither without CUDA.
    cuda_ok = torch.cuda.is_available()
    use_bf16 = cuda_ok and torch.cuda.is_bf16_supported()
    use_fp16 = cuda_ok and not use_bf16

    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=num_epochs,
        max_steps=max_steps,
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        learning_rate=2e-4,
        logging_steps=10,
        save_strategy="epoch",
        save_total_limit=2,
        fp16=use_fp16,
        bf16=use_bf16,
        gradient_checkpointing=True,
        optim="paged_adamw_8bit",
        lr_scheduler_type="cosine",
        warmup_ratio=0.03,
        report_to="none",
        remove_unused_columns=False,
    )

    # The data is already tokenized, so the plain Trainer is sufficient.
    # With mlm=False the collator builds causal-LM labels from `input_ids`
    # and masks padding-token positions out of the loss.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized,
        data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
    )

    trainer.train()
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
    print(f"[OK] Adapter salvo em: {output_dir}")


In [None]:
def main(argv=None):
    """Command-line entry point: parse the options and launch training.

    Args:
        argv: Optional list of argument strings. When ``None`` argparse reads
            ``sys.argv``; inside a notebook kernel pass an explicit list
            (e.g. ``["--data", "train.jsonl"]``), because ``sys.argv`` there
            holds the kernel's own arguments.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--data", type=str, required=True, help="Caminho do JSONL com campo 'text'")
    parser.add_argument("--outdir", type=str, default="outputs/llama_finetuned")
    parser.add_argument("--epochs", type=int, default=1)
    args = parser.parse_args(argv)

    # Make sure the output directory exists before training writes into it.
    Path(args.outdir).mkdir(parents=True, exist_ok=True)
    run_train(args.data, args.outdir, args.epochs)

In [None]:
# Launch a one-epoch fine-tuning run using the Drive paths configured above.
run_train(JSONL_PATH, output_dir=OUT_DIR, num_epochs=1)