In [None]:
# ====================================================================
# FINE-TUNING LLAMA 3.2-1B - A100 VELOCIDADE MÁXIMA
# ====================================================================
# ✅ SÓ RODAR - SEM CHECKPOINTS - OTIMIZADO PARA VELOCIDADE
# ====================================================================

# 1️⃣ QUICK INSTALL
# These are IPython/Colab shell escapes; they only work inside a notebook cell.
!pip install -q --upgrade pip
!pip install -q accelerate bitsandbytes einops sentencepiece
# PEFT is installed from the GitHub repository rather than from a PyPI release.
!pip install -q git+https://github.com/huggingface/peft.git
!pip install -q unsloth xformers trl
# NOTE(review): this upgrade runs after unsloth is installed. In the recorded
# cell output pip reports that the resulting transformers 4.57.0 conflicts with
# the range required by unsloth / unsloth-zoo (<=4.56.2). Nothing in the visible
# cell imports unsloth, xformers, trl or einops - confirm whether those installs
# are still needed, or install everything in one pip call so the resolver can
# pick a consistent transformers version.
!pip install -q --upgrade transformers datasets tqdm

# 2️⃣ IMPORTS
import os
import time
import torch
import gc
from pathlib import Path
from datasets import load_dataset, load_from_disk
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

print("✅ Pacotes importados com sucesso!\n")

# 3️⃣ MOUNT GOOGLE DRIVE
# Colab-only module, imported right next to its single use.
from google.colab import drive
drive.mount('/content/drive')

# 4️⃣ DIRECTORY LAYOUT
# Every path used by this script hangs off one folder on the mounted Drive.
BASE_DIR = Path("/content/drive/MyDrive/tc_fiap_ft1")
INPUT_FILE = BASE_DIR / "trn.json"
CHUNKS_DIR = BASE_DIR / "chunks"
OUTPUT_DIR = BASE_DIR / "output"
TOKENIZED_DIR = BASE_DIR / "tokenized_datasets"

# Create the base folder first, then its working sub-folders; existing
# folders are left untouched (exist_ok=True).
for directory in (BASE_DIR, CHUNKS_DIR, OUTPUT_DIR, TOKENIZED_DIR):
    directory.mkdir(parents=True, exist_ok=True)

print("📁 Diretórios configurados:")
print(f"   Base: {BASE_DIR}")
print(f"   Output: {OUTPUT_DIR}\n")

# 5️⃣ GPU CHECK
# Ask for CUDA availability once and reuse the answer for both the device
# handle and the report printed below.
cuda_available = torch.cuda.is_available()
device = torch.device("cuda" if cuda_available else "cpu")

if not cuda_available:
    print("⚠️  Sem GPU detectada!\n")
else:
    gpu_name = torch.cuda.get_device_name(0)
    # total_memory is divided by 1e9, so the figure is in decimal gigabytes.
    gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f"🖥️  GPU: {gpu_name}")
    print(f"💾 VRAM: {gpu_memory:.0f} GB\n")

# 6️⃣ LOAD MODEL AND TOKENIZER
print("📦 Carregando modelo LLaMA 3.2-1B...")
model_name = "unsloth/Llama-3.2-1B-bnb-4bit"

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Fall back to the EOS token for padding when the checkpoint defines no pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Passing `load_in_4bit=True` directly to `from_pretrained` is deprecated
# (transformers warns about it); the supported form is a BitsAndBytesConfig
# handed over through `quantization_config`. Imported here so this section
# carries its own dependency.
from transformers import BitsAndBytesConfig

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=BitsAndBytesConfig(load_in_4bit=True),
    device_map="auto",
    torch_dtype=torch.bfloat16,
)

# Prepare the quantized model for adapter training.
model = prepare_model_for_kbit_training(model)
# NOTE(review): prepare_model_for_kbit_training presumably already turns on
# gradient checkpointing by default, which would make this call redundant
# (harmless) - confirm against the installed PEFT version.
model.gradient_checkpointing_enable()

print("✅ Modelo carregado!\n")

# 7️⃣ LORA SETUP (tuned for speed)
print("🔧 Aplicando LoRA...")

# Adapters are attached to the four attention projections only; the original
# author's rationale is that fewer target modules train faster.
attention_projections = ["q_proj", "k_proj", "v_proj", "o_proj"]

lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=attention_projections,
    r=16,  # adapter rank, kept low for speed
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

# Wrap the base model with the adapters and report the trainable parameters.
model = get_peft_model(model, lora_config)
print("✅ LoRA aplicado!")
model.print_trainable_parameters()
print()

# 8️⃣ LOAD OR BUILD THE TOKENIZED DATASETS
train_path = TOKENIZED_DIR / "train"
eval_path = TOKENIZED_DIR / "eval"

if train_path.exists() and eval_path.exists():
    # Both splits were written by an earlier run: reuse them as they are.
    # NOTE(review): the cache is reused whenever both folders exist, no matter
    # which max_length, split ratio or tokenizer produced it. Delete
    # TOKENIZED_DIR after changing any of those settings, otherwise stale data
    # is trained on silently.
    print("♻️  Carregando datasets tokenizados do cache...")
    tokenized_train = load_from_disk(str(train_path))
    tokenized_eval = load_from_disk(str(eval_path))
    print(f"✅ Train: {len(tokenized_train):,} | Eval: {len(tokenized_eval):,}\n")

else:
    print("🔄 Processando datasets pela primeira vez...\n")

    # Gather the raw JSON chunks in a deterministic (sorted) order.
    json_files = sorted(str(f) for f in CHUNKS_DIR.glob("*.json"))
    if not json_files:
        print(f"❌ Nenhum arquivo JSON em {CHUNKS_DIR}")
        print(f"💡 Coloque seus arquivos chunk_*.json em: {CHUNKS_DIR}\n")
        raise FileNotFoundError("Arquivos JSON não encontrados!")

    print(f"📊 Carregando {len(json_files)} arquivo(s)...")
    dataset = load_dataset("json", data_files=json_files, split="train")

    # Hold out 5% for evaluation; the fixed seed keeps the split reproducible.
    split = dataset.train_test_split(test_size=0.05, seed=42)
    train_dataset = split["train"]
    eval_dataset = split["test"]

    print(f"📊 Train: {len(train_dataset):,} | Eval: {len(eval_dataset):,}")

    # Every example is truncated/padded to exactly this many tokens.
    max_length = 768

    def preprocess(examples):
        """Turn one batch of raw rows into fixed-length causal-LM features.

        Args:
            examples: batch mapping of column name to list of values. The
                "content" column is read; "title" is optional and defaults to
                an empty string for every row.

        Returns:
            The tokenizer output for the batch with a "labels" entry that
            copies "input_ids".
        """
        contents = examples.get("content", [])
        titles = examples.get("title", [""] * len(contents))

        def _as_text(value):
            # A JSON null would otherwise be formatted as the literal "None".
            return "" if value is None else value

        texts = [
            f"### Título: {_as_text(t)}\n### Conteúdo: {_as_text(c)}"
            for t, c in zip(titles, contents)
        ]

        tokenized = tokenizer(
            texts,
            truncation=True,
            padding="max_length",
            max_length=max_length,
            return_tensors=None
        )
        tokenized["labels"] = tokenized["input_ids"].copy()
        return tokenized

    def _tokenize_split(raw_split, desc):
        """Apply `preprocess` to one split, dropping all of its raw columns."""
        return raw_split.map(
            preprocess,
            batched=True,
            batch_size=2000,
            num_proc=4,
            remove_columns=raw_split.column_names,
            desc=desc
        )

    print("🔄 Tokenizando (isso pode levar 10-15 min)...")
    tokenized_train = _tokenize_split(train_dataset, "Tokenizando treino")
    tokenized_eval = _tokenize_split(eval_dataset, "Tokenizando eval")

    # Persist both splits so later runs take the cache branch above.
    print("💾 Salvando cache...")
    tokenized_train.save_to_disk(str(train_path))
    tokenized_eval.save_to_disk(str(eval_path))
    print(f"✅ Cache salvo em: {TOKENIZED_DIR}\n")

# 9️⃣ DATA COLLATOR
# mlm=False selects causal-LM batching (no masked-token objective);
# batches are returned as PyTorch tensors.
collator_options = {"mlm": False, "return_tensors": "pt"}
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, **collator_options)

# 🔟 TRAINING ARGUMENTS - MAXIMUM SPEED
print("⚙️  Configurando treinamento...\n")
training_args = TrainingArguments(
    # Local scratch space; the original author chose /tmp over Drive for speed.
    output_dir="/tmp/training_output",

    # BATCH - sized by the original author for an A100
    per_device_train_batch_size=24,
    per_device_eval_batch_size=24,
    gradient_accumulation_steps=1,  # no accumulation: one optimizer step per batch

    # LEARNING RATE SCHEDULE
    learning_rate=5e-4,
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,

    # PRECISION
    bf16=True,
    bf16_full_eval=True,

    # MINIMAL LOGGING
    logging_steps=100,
    logging_first_step=True,

    # NO CHECKPOINTS DURING TRAINING
    # (`save_steps` was dropped: it is only consulted when save_strategy="steps".)
    save_strategy="no",

    # MINIMAL EVALUATION
    eval_strategy="steps",
    eval_steps=500,  # 4 evaluations over the 2000 steps below

    # RUN LENGTH
    # A positive max_steps overrides num_train_epochs, so the epoch count that
    # used to be set next to it had no effect and was removed.
    max_steps=2000,

    # OPTIMIZATIONS
    optim="adamw_torch_fused",
    gradient_checkpointing=True,

    # DATALOADER
    dataloader_num_workers=2,  # the original author's speed/RAM trade-off
    dataloader_pin_memory=True,

    # NO EXTRAS
    report_to="none",
    load_best_model_at_end=False,
    disable_tqdm=False,
)

# 1️⃣1️⃣ BUILD THE TRAINER
# Every scheduled evaluation iterates over the whole eval dataset it is given.
# With a large held-out split that costs far more than the training steps
# themselves, which defeats the "minimal evaluation" intent of this script, so
# the evaluation set is capped. Set EVAL_MAX_SAMPLES to None to evaluate on
# the full split.
# NOTE(review): taking the first rows assumes the split is already shuffled
# (train_test_split shuffles by default) - confirm for caches built elsewhere.
EVAL_MAX_SAMPLES = 2000
eval_subset = tokenized_eval
if EVAL_MAX_SAMPLES is not None and len(tokenized_eval) > EVAL_MAX_SAMPLES:
    eval_subset = tokenized_eval.select(range(EVAL_MAX_SAMPLES))
    print(f"📊 Eval limitado a {len(eval_subset):,} de {len(tokenized_eval):,} exemplos")

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=eval_subset,
    # `tokenizer=` is deprecated on Trainer; `processing_class=` is its replacement.
    processing_class=tokenizer,
    data_collator=data_collator,
)

print("✅ Trainer configurado!")
print(f"🎯 Total steps: {training_args.max_steps}")
# The estimate assumes roughly 60 optimizer steps per minute.
print(f"⏱️  Tempo estimado: ~{training_args.max_steps/60:.0f} minutos\n")

# 1️⃣2️⃣ TRAIN
# Report the sequence length that is actually in the dataset instead of a
# hard-coded number: the tokenized data may come from a cache built with
# other settings. Preprocessing pads every row to the same length, so the
# first row is representative.
if len(tokenized_train) > 0:
    sequence_length = len(tokenized_train[0]["input_ids"])
else:
    sequence_length = 0

print("="*100)
print("🔥 INICIANDO TREINAMENTO".center(100))
print("="*100)
print(f"\n⚙️  Configuração:")
print(f"   • Batch Size: {training_args.per_device_train_batch_size}")
print(f"   • Max Length: {sequence_length} tokens")
print(f"   • Learning Rate: {training_args.learning_rate}")
print(f"   • Steps: {training_args.max_steps}")
# Read the interval from the config so the banner cannot drift from it.
print(f"   • Eval: A cada {training_args.eval_steps} steps")
print(f"   • Checkpoints: Desabilitados")
print(f"   • Precisão: BFloat16")
print(f"\n⚡ Velocidade esperada: ~60 steps/min")
print(f"⏱️  ETA: ~{training_args.max_steps/60:.0f} minutos")
print("="*100 + "\n")

start_time = time.time()

try:
    trainer.train()

except KeyboardInterrupt:
    # Deliberately not re-raised: the rest of the script still saves whatever
    # was trained up to the interruption.
    print("\n⚠️  Treinamento interrompido pelo usuário!\n")

except Exception as e:
    print(f"\n❌ Erro: {e}\n")
    raise

else:
    print("\n✅ Treinamento concluído!\n")

finally:
    # Runs on success, interruption and failure alike.
    training_time = (time.time() - start_time) / 60
    print(f"⏱️  Tempo de treinamento: {training_time:.1f} minutos")

    # Release Python garbage and cached CUDA blocks.
    gc.collect()
    torch.cuda.empty_cache()

# 1️⃣3️⃣ SAVE THE FINAL MODEL
print("\n" + "="*100)
print("💾 SALVANDO MODELO FINAL".center(100))
print("="*100 + "\n")

final_model_path = OUTPUT_DIR / "final_model"
save_dir = str(final_model_path)

try:
    # Model first, then tokenizer, both into the same folder.
    for artifact in (model, tokenizer):
        artifact.save_pretrained(save_dir)

    print(f"✅ Modelo salvo em: {final_model_path}\n")

    # List what was written, with sizes in MiB.
    print("📁 Arquivos gerados:")
    for entry in sorted(final_model_path.iterdir()):
        size_mb = entry.stat().st_size / (1024*1024)
        print(f"   • {entry.name}: {size_mb:.1f} MB")

except Exception as err:
    # Report the failure, then let it propagate.
    print(f"❌ Erro ao salvar: {err}\n")
    raise

# 1️⃣4️⃣ FINAL STATISTICS
print("\n" + "="*100)
print("📊 RESUMO DO TREINAMENTO".center(100))
print("="*100 + "\n")


def _box_row(text=""):
    """Return one 100-column row of the summary box containing `text`."""
    return f"║  {text}".ljust(99) + "║"


# The trainer's log history holds one dict per logging/eval event.
logs = getattr(trainer.state, "log_history", None) or []
train_losses = [entry['loss'] for entry in logs if 'loss' in entry]
eval_losses = [entry['eval_loss'] for entry in logs if 'eval_loss' in entry]
steps_done = trainer.state.global_step

if train_losses:
    # Guard the rate against a zero elapsed time.
    steps_per_minute = steps_done / training_time if training_time > 0 else 0.0
    first_loss, last_loss = train_losses[0], train_losses[-1]

    print("╔" + "═"*98 + "╗")
    print("║" + " 📈 MÉTRICAS FINAIS ".center(98) + "║")
    print("╠" + "═"*98 + "╣")
    print(_box_row(f"🎯 Steps: {steps_done:,}/{training_args.max_steps:,}"))
    print(_box_row(f"⏱️  Tempo: {training_time:.1f} minutos"))
    print(_box_row(f"⚡ Velocidade: {steps_per_minute:.1f} steps/min"))
    print(_box_row())
    print(_box_row(f"📉 Loss Inicial: {first_loss:.4f}"))
    print(_box_row(f"📉 Loss Final: {last_loss:.4f}"))

    if eval_losses:
        print(_box_row(f"📊 Melhor Eval Loss: {min(eval_losses):.4f}"))

    # Relative improvement is undefined when the first logged loss is zero.
    if first_loss:
        improvement = ((first_loss - last_loss) / first_loss) * 100
        print(_box_row(f"📈 Melhoria: {improvement:.1f}%"))
    print("╚" + "═"*98 + "╝")

# Only claim success when every planned step ran; an interrupted run is
# allowed to reach this point, so it gets its own message.
if steps_done >= training_args.max_steps:
    print("\n🎉 TREINAMENTO CONCLUÍDO COM SUCESSO!")
else:
    print(f"\n⚠️  TREINAMENTO INCOMPLETO: {steps_done:,}/{training_args.max_steps:,} steps")
print(f"💾 Modelo pronto em: {final_model_path}")
print(f"📊 Datasets tokenizados em: {TOKENIZED_DIR}")
print("\n✅ Próxima execução será mais rápida (usa cache)!")
print("="*100 + "\n")

[0m  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[0m[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
unsloth 2025.10.1 requires transformers!=4.52.0,!=4.52.1,!=4.52.2,!=4.52.3,!=4.53.0,!=4.54.0,!=4.55.0,!=4.55.1,<=4.56.2,>=4.51.3, but you have transformers 4.57.0 which is incompatible.
unsloth-zoo 2025.10.1 requires transformers!=4.52.0,!=4.52.1,!=4.52.2,!=4.52.3,!=4.53.0,!=4.54.0,!=4.55.0,!=4.55.1,<=4.56.2,>=4.51.3, but you have transformers 4.57.0 which is incompatible.[0m[31m
[0m✅ Pacotes importados com sucesso!

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
📁 Diretórios configurados:
   Base: /content/drive/MyDrive/tc_fiap_ft1
   Output: /conte

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


✅ Modelo carregado!

🔧 Aplicando LoRA...
✅ LoRA aplicado!
trainable params: 3,407,872 || all params: 1,239,222,272 || trainable%: 0.2750

♻️  Carregando datasets tokenizados do cache...


Loading dataset from disk:   0%|          | 0/33 [00:00<?, ?it/s]

  trainer = Trainer(
The model is already on multiple devices. Skipping the move to device specified in `args`.


✅ Train: 1,229,886 | Eval: 136,655

⚙️  Configurando treinamento...

✅ Trainer configurado!
🎯 Total steps: 2000
⏱️  Tempo estimado: ~33 minutos

                                      🔥 INICIANDO TREINAMENTO                                       

⚙️  Configuração:
   • Batch Size: 24
   • Max Length: 768 tokens
   • Learning Rate: 0.0005
   • Steps: 2000
   • Eval: A cada 500 steps
   • Checkpoints: Desabilitados
   • Precisão: BFloat16

⚡ Velocidade esperada: ~60 steps/min
⏱️  ETA: ~33 minutos



Step,Training Loss,Validation Loss
