In [None]:
# ====================================================================
# FINE-TUNING LLAMA 3.2-1B - A100 VELOCIDADE MÁXIMA
# ====================================================================
# ✅ SÓ RODAR - SEM CHECKPOINTS - OTIMIZADO PARA VELOCIDADE
# ====================================================================

# 1Ô∏è‚É£ INSTALA√á√ÉO R√ÅPIDA
!pip install -q --upgrade pip
!pip install -q accelerate bitsandbytes einops sentencepiece
!pip install -q git+https://github.com/huggingface/peft.git
!pip install -q unsloth xformers trl
!pip install -q --upgrade transformers datasets tqdm

# 2Ô∏è‚É£ IMPORTS
import os
import time
import torch
import gc
from pathlib import Path
from datasets import load_dataset, load_from_disk
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

print("‚úÖ Pacotes importados com sucesso!\n")

# 3. Mount Google Drive (this import only exists on Colab).
from google.colab import drive
drive.mount('/content/drive')

# 4. Project layout: everything lives under a single Drive folder.
BASE_DIR = Path("/content/drive/MyDrive/tc_fiap_ft1")
INPUT_FILE = BASE_DIR.joinpath("trn.json")
CHUNKS_DIR = BASE_DIR.joinpath("chunks")
OUTPUT_DIR = BASE_DIR.joinpath("output")
TOKENIZED_DIR = BASE_DIR.joinpath("tokenized_datasets")

# Create the base folder first, then each working folder; existing folders
# are left untouched (exist_ok=True).
for d in (BASE_DIR, CHUNKS_DIR, OUTPUT_DIR, TOKENIZED_DIR):
    d.mkdir(parents=True, exist_ok=True)

print(
    "\n".join(
        [
            "üìÅ Diret√≥rios configurados:",
            f"   Base: {BASE_DIR}",
            f"   Output: {OUTPUT_DIR}\n",
        ]
    )
)

# 5. Select the compute device and report the GPU, if one is visible.
if not torch.cuda.is_available():
    device = torch.device("cpu")
    print("‚ö†Ô∏è  Sem GPU detectada!\n")
else:
    device = torch.device("cuda")
    gpu_name = torch.cuda.get_device_name(0)
    # total_memory is in bytes; dividing by 1e9 reports decimal gigabytes.
    gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f"üñ•Ô∏è  GPU: {gpu_name}")
    print(f"üíæ VRAM: {gpu_memory:.0f} GB\n")

# 6. Load the tokenizer and the 4-bit quantized base model.
# Local import, in keeping with this script's other mid-file imports.
from transformers import BitsAndBytesConfig

print("üì¶ Carregando modelo LLaMA 3.2-1B...")
model_name = "unsloth/Llama-3.2-1B-bnb-4bit"

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Fall back to EOS as the padding token when the tokenizer defines none.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# The bare `load_in_4bit=True` keyword on `from_pretrained` is deprecated;
# the supported way to request 4-bit loading is a `BitsAndBytesConfig`
# passed as `quantization_config`.
quantization_config = BitsAndBytesConfig(load_in_4bit=True)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)

# Prepare the quantized model for k-bit fine-tuning and turn on gradient
# checkpointing before LoRA adapters are attached.
model = prepare_model_for_kbit_training(model)
model.gradient_checkpointing_enable()

print("‚úÖ Modelo carregado!\n")

# 7. Attach LoRA adapters; only the four attention projections are targeted.
print("üîß Aplicando LoRA...")

attention_projections = ["q_proj", "k_proj", "v_proj", "o_proj"]
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=attention_projections,
)

# Wrap the base model so that only the adapter weights are trainable, then
# report how many parameters that leaves.
model = get_peft_model(model, lora_config)
print("‚úÖ LoRA aplicado!")
model.print_trainable_parameters()
print()

# 8. Load the tokenized datasets from the on-disk cache, or build and cache them.
train_path = TOKENIZED_DIR / "train"
eval_path = TOKENIZED_DIR / "eval"

# The cache is only used when BOTH splits are present.
cache_is_complete = train_path.exists() and eval_path.exists()

if not cache_is_complete:
    print("üîÑ Processando datasets pela primeira vez...\n")

    # Collect the raw JSON chunks in sorted order.
    json_files = sorted(str(f) for f in CHUNKS_DIR.glob("*.json"))
    if len(json_files) == 0:
        print(f"‚ùå Nenhum arquivo JSON em {CHUNKS_DIR}")
        print(f"üí° Coloque seus arquivos chunk_*.json em: {CHUNKS_DIR}\n")
        raise FileNotFoundError("Arquivos JSON n√£o encontrados!")

    print(f"üìä Carregando {len(json_files)} arquivo(s)...")
    dataset = load_dataset("json", data_files=json_files, split="train")

    # Hold out 5% of the rows for evaluation, with a fixed seed.
    split = dataset.train_test_split(test_size=0.05, seed=42)
    train_dataset, eval_dataset = split["train"], split["test"]

    print(f"üìä Train: {len(train_dataset):,} | Eval: {len(eval_dataset):,}")

    # Every example is truncated/padded to exactly this many tokens.
    max_length = 768

    def preprocess(examples):
        """Format and tokenize one batch of rows.

        Args:
            examples: Batch mapping of column name to list of values, as
                passed by ``Dataset.map(batched=True)``. Reads the
                ``"content"`` column and, when present, ``"title"``;
                a missing title column is treated as empty strings.

        Returns:
            The tokenizer output for the formatted prompts, with an added
            ``"labels"`` entry that is a copy of ``"input_ids"``.
        """
        bodies = examples.get("content", [])
        headings = examples.get("title", [""] * len(bodies))

        prompts = []
        for heading, body in zip(headings, bodies):
            prompts.append(f"### T√≠tulo: {heading}\n### Conte√∫do: {body}")

        encoded = tokenizer(
            prompts,
            truncation=True,
            padding="max_length",
            max_length=max_length,
            return_tensors=None,
        )
        encoded["labels"] = encoded["input_ids"].copy()
        return encoded

    print("üîÑ Tokenizando (isso pode levar 10-15 min)...")
    # Options shared by both splits.
    map_options = dict(batched=True, batch_size=2000, num_proc=4)

    tokenized_train = train_dataset.map(
        preprocess,
        remove_columns=train_dataset.column_names,
        desc="Tokenizando treino",
        **map_options,
    )
    tokenized_eval = eval_dataset.map(
        preprocess,
        remove_columns=eval_dataset.column_names,
        desc="Tokenizando eval",
        **map_options,
    )

    # Persist both splits so later runs can take the cached branch.
    print("üíæ Salvando cache...")
    tokenized_train.save_to_disk(str(train_path))
    tokenized_eval.save_to_disk(str(eval_path))
    print(f"‚úÖ Cache salvo em: {TOKENIZED_DIR}\n")

else:
    print("‚ôªÔ∏è  Carregando datasets tokenizados do cache...")
    tokenized_train = load_from_disk(str(train_path))
    tokenized_eval = load_from_disk(str(eval_path))
    print(f"‚úÖ Train: {len(tokenized_train):,} | Eval: {len(tokenized_eval):,}\n")

# 9. Batch collator for causal language modelling (mlm=False turns off
# masked-LM), returning PyTorch tensors.
data_collator = DataCollatorForLanguageModeling(
    mlm=False,
    return_tensors="pt",
    tokenizer=tokenizer,
)

# üîü TRAINING ARGUMENTS - M√ÅXIMA VELOCIDADE
print("‚öôÔ∏è  Configurando treinamento...\n")

# All training settings collected in one mapping, grouped by concern, then
# expanded into TrainingArguments.
training_config = {
    # Trainer output goes to local /tmp rather than the Drive mount.
    "output_dir": "/tmp/training_output",
    # Batching: 24 sequences per device, one optimizer step per batch.
    "per_device_train_batch_size": 24,
    "per_device_eval_batch_size": 24,
    "gradient_accumulation_steps": 1,
    # Learning-rate schedule: cosine decay after a 3% warmup.
    "learning_rate": 5e-4,
    "lr_scheduler_type": "cosine",
    "warmup_ratio": 0.03,
    # Precision: bfloat16 for both training and evaluation.
    "bf16": True,
    "bf16_full_eval": True,
    # Logging: every 100 steps, plus the very first step.
    "logging_steps": 100,
    "logging_first_step": True,
    # Checkpointing is disabled during training.
    "save_strategy": "no",
    "save_steps": 999999,
    # Evaluation every 500 steps (4 evaluations over 2000 steps).
    "eval_strategy": "steps",
    "eval_steps": 500,
    # Run length. NOTE(review): per the TrainingArguments docs a positive
    # max_steps takes precedence over num_train_epochs.
    "num_train_epochs": 1,
    "max_steps": 2000,
    # Optimizer and memory settings.
    "optim": "adamw_torch_fused",
    "gradient_checkpointing": True,
    # Data loading.
    "dataloader_num_workers": 2,
    "dataloader_pin_memory": True,
    # No experiment tracker, no best-model reload, progress bars on.
    "report_to": "none",
    "load_best_model_at_end": False,
    "disable_tqdm": False,
}

training_args = TrainingArguments(**training_config)

# 11. Build the Trainer.
#
# Evaluation runs every `eval_steps` training steps. Evaluating the whole
# held-out split each time (136,655 rows in the cached run, i.e. roughly
# 5,700 batches of 24) would cost far more than the 2,000 training steps and
# defeats the "minimal evaluation" goal, so evaluation is capped to a fixed
# prefix of the split.
MAX_EVAL_SAMPLES = 2_000
eval_subset = tokenized_eval.select(
    range(min(len(tokenized_eval), MAX_EVAL_SAMPLES))
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=eval_subset,
    # `tokenizer=` is deprecated on Trainer; `processing_class=` replaces it.
    processing_class=tokenizer,
    data_collator=data_collator,
)

print("‚úÖ Trainer configurado!")
print(f"üéØ Total steps: {training_args.max_steps}")
# The estimate assumes roughly 60 training steps per minute.
print(f"‚è±Ô∏è  Tempo estimado: ~{training_args.max_steps/60:.0f} minutos\n")

# 12. Training: print a banner summarising the run configuration.
rule = "=" * 100
banner_lines = [
    rule,
    "üî• INICIANDO TREINAMENTO".center(100),
    rule,
    "\n‚öôÔ∏è  Configura√ß√£o:",
    f"   ‚Ä¢ Batch Size: {training_args.per_device_train_batch_size}",
    "   ‚Ä¢ Max Length: 768 tokens",
    f"   ‚Ä¢ Learning Rate: {training_args.learning_rate}",
    f"   ‚Ä¢ Steps: {training_args.max_steps}",
    "   ‚Ä¢ Eval: A cada 500 steps",
    "   ‚Ä¢ Checkpoints: Desabilitados",
    "   ‚Ä¢ Precis√£o: BFloat16",
    "\n‚ö° Velocidade esperada: ~60 steps/min",
    f"‚è±Ô∏è  ETA: ~{training_args.max_steps/60:.0f} minutos",
    rule + "\n",
]
# One print call; joining with newlines yields the same output as printing
# each line separately.
print("\n".join(banner_lines))

# Run the training loop and time it (wall clock).
start_time = time.time()

try:
    trainer.train()

except KeyboardInterrupt:
    # A manual interrupt is not fatal: execution continues past this block
    # with whatever has been trained so far.
    print("\n‚ö†Ô∏è  Treinamento interrompido pelo usu√°rio!\n")

except Exception as err:
    # Report any other failure, then re-raise it.
    print(f"\n‚ùå Erro: {err}\n")
    raise

else:
    print("\n‚úÖ Treinamento conclu√≠do!\n")

finally:
    # Always report the elapsed time (in minutes) and release cached memory.
    elapsed_seconds = time.time() - start_time
    training_time = elapsed_seconds / 60
    print(f"‚è±Ô∏è  Tempo de treinamento: {training_time:.1f} minutos")

    gc.collect()
    torch.cuda.empty_cache()

# 13. Save the final model and tokenizer to the Drive output folder.
save_rule = "=" * 100
print("\n" + save_rule)
print("üíæ SALVANDO MODELO FINAL".center(100))
print(save_rule + "\n")

final_model_path = OUTPUT_DIR / "final_model"

try:
    # Model first, then tokenizer, both into the same directory.
    for artifact_owner in (model, tokenizer):
        artifact_owner.save_pretrained(str(final_model_path))

    print(f"‚úÖ Modelo salvo em: {final_model_path}\n")

    # List what was written, with sizes in MiB.
    print("üìÅ Arquivos gerados:")
    bytes_per_mib = 1024 * 1024
    for saved_file in sorted(final_model_path.iterdir()):
        size_mib = saved_file.stat().st_size / bytes_per_mib
        print(f"   ‚Ä¢ {saved_file.name}: {size_mib:.1f} MB")

except Exception as err:
    # Report the failure, then re-raise it.
    print(f"‚ùå Erro ao salvar: {err}\n")
    raise

# 14. Final statistics, rendered as a fixed-width box.
summary_rule = "=" * 100
print("\n" + summary_rule)
print("üìä RESUMO DO TREINAMENTO".center(100))
print(summary_rule + "\n")


def _print_row(text):
    """Print one box row: pad *text* to 99 characters and close the border."""
    print(text.ljust(99) + "‚ïë")


log_history = getattr(trainer.state, "log_history", None)
if log_history:
    logs = log_history
    # Training entries carry a 'loss' key; evaluation entries carry 'eval_loss'.
    train_losses = [entry["loss"] for entry in logs if "loss" in entry]
    eval_losses = [entry["eval_loss"] for entry in logs if "eval_loss" in entry]

    if train_losses:
        print("‚ïî" + "‚ïê"*98 + "‚ïó")
        print("‚ïë" + " üìà M√âTRICAS FINAIS ".center(98) + "‚ïë")
        print("‚ï†" + "‚ïê"*98 + "‚ï£")
        _print_row(f"‚ïë  üéØ Steps: {trainer.state.global_step:,}/{training_args.max_steps:,}")
        _print_row(f"‚ïë  ‚è±Ô∏è  Tempo: {training_time:.1f} minutos")
        _print_row(f"‚ïë  ‚ö° Velocidade: {trainer.state.global_step/training_time:.1f} steps/min")
        print("‚ïë" + " "*98 + "‚ïë")
        _print_row(f"‚ïë  üìâ Loss Inicial: {train_losses[0]:.4f}")
        _print_row(f"‚ïë  üìâ Loss Final: {train_losses[-1]:.4f}")

        if eval_losses:
            _print_row(f"‚ïë  üìä Melhor Eval Loss: {min(eval_losses):.4f}")

        # Relative drop from the first to the last logged training loss, in percent.
        first_loss, last_loss = train_losses[0], train_losses[-1]
        improvement = ((first_loss - last_loss) / first_loss) * 100
        _print_row(f"‚ïë  üìà Melhoria: {improvement:.1f}%")
        print("‚ïö" + "‚ïê"*98 + "‚ïù")

# Closing messages: where the model and the tokenized cache were written.
closing_lines = (
    "\nüéâ TREINAMENTO CONCLU√çDO COM SUCESSO!",
    f"üíæ Modelo pronto em: {final_model_path}",
    f"üìä Datasets tokenizados em: {TOKENIZED_DIR}",
    "\n‚úÖ Pr√≥xima execu√ß√£o ser√° mais r√°pida (usa cache)!",
    "=" * 100 + "\n",
)
for closing_line in closing_lines:
    print(closing_line)

[0m  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[0m[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
unsloth 2025.10.1 requires transformers!=4.52.0,!=4.52.1,!=4.52.2,!=4.52.3,!=4.53.0,!=4.54.0,!=4.55.0,!=4.55.1,<=4.56.2,>=4.51.3, but you have transformers 4.57.0 which is incompatible.
unsloth-zoo 2025.10.1 requires transformers!=4.52.0,!=4.52.1,!=4.52.2,!=4.52.3,!=4.53.0,!=4.54.0,!=4.55.0,!=4.55.1,<=4.56.2,>=4.51.3, but you have transformers 4.57.0 which is incompatible.[0m[31m
[0m‚úÖ Pacotes importados com sucesso!

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
üìÅ Diret√≥rios configurados:
   Base: /content/drive/MyDrive/tc_fiap_ft1
   Output: 

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


‚úÖ Modelo carregado!

üîß Aplicando LoRA...
‚úÖ LoRA aplicado!
trainable params: 3,407,872 || all params: 1,239,222,272 || trainable%: 0.2750

‚ôªÔ∏è  Carregando datasets tokenizados do cache...


Loading dataset from disk:   0%|          | 0/33 [00:00<?, ?it/s]

  trainer = Trainer(
The model is already on multiple devices. Skipping the move to device specified in `args`.


‚úÖ Train: 1,229,886 | Eval: 136,655

‚öôÔ∏è  Configurando treinamento...

‚úÖ Trainer configurado!
üéØ Total steps: 2000
‚è±Ô∏è  Tempo estimado: ~33 minutos

                                      üî• INICIANDO TREINAMENTO                                       

‚öôÔ∏è  Configura√ß√£o:
   ‚Ä¢ Batch Size: 24
   ‚Ä¢ Max Length: 768 tokens
   ‚Ä¢ Learning Rate: 0.0005
   ‚Ä¢ Steps: 2000
   ‚Ä¢ Eval: A cada 500 steps
   ‚Ä¢ Checkpoints: Desabilitados
   ‚Ä¢ Precis√£o: BFloat16

‚ö° Velocidade esperada: ~60 steps/min
‚è±Ô∏è  ETA: ~33 minutos



Step,Training Loss,Validation Loss
