In [None]:
import shutil
import os

# Folders created by earlier steps of this notebook
pastas_lixo = [
    "./debug_out",
    "./debug_low_ram",
    "./gemma_mwe_finetuned",
    "./gemma_a100_finetuned",
    "./sample_data",  # Default Colab folder
]


def tamanho_pasta(pasta):
    """Return the summed size in bytes of the regular files under `pasta`.

    Symbolic links are skipped so that a dangling link cannot raise and a link
    target outside the folder is not counted.
    """
    total = 0
    for raiz, _subpastas, arquivos in os.walk(pasta):
        for nome in arquivos:
            caminho = os.path.join(raiz, nome)
            if not os.path.islink(caminho):
                total += os.path.getsize(caminho)
    return total


# Bytes actually released; only folders that were removed successfully count.
total_liberado = 0
# Folders that existed but could not be removed.
falhas = []

for pasta in pastas_lixo:
    if not os.path.exists(pasta):
        print(f"Já não existia: {pasta}")
        continue
    try:
        # Measure before deleting; afterwards there is nothing left to measure.
        tamanho = tamanho_pasta(pasta)
        shutil.rmtree(pasta)
    except OSError as e:
        # rmtree and the size walk report failures as OSError subclasses.
        falhas.append(pasta)
        print(f"Erro ao apagar {pasta}: {e}")
    else:
        total_liberado += tamanho
        print(f"Apagada: {pasta}")

print(f"\nEspaço liberado: {total_liberado / 1024 ** 2:.1f} MB")
if falhas:
    # Do not claim success when some folder is still on disk.
    print(f"Limpeza incompleta; não foi possível apagar: {', '.join(falhas)}")
else:
    print("Limpeza concluída! Pode rodar o treino.")

In [None]:
import os

from huggingface_hub import login

# Never hardcode the access token in the notebook: read it from the HF_TOKEN
# environment variable. An unset or empty variable becomes None, which makes
# login() ask for the token interactively instead of sending an empty string.
login(token=os.environ.get("HF_TOKEN") or None)

In [None]:
import torch
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from trl import SFTTrainer, SFTConfig
from gensim.models.phrases import Phrases, Phraser
import os
import gc

# --- 1. ENVIRONMENT SETUP ---
# Collect garbage first: tensors that are only kept alive by unreachable
# reference cycles still occupy their CUDA blocks until the collector runs,
# so emptying the cache before collecting cannot release them.
gc.collect()
torch.cuda.empty_cache()
# Turn off the Weights & Biases integration for this run.
os.environ["WANDB_DISABLED"] = "true"

print("INICIANDO EXPERIMENTO...")

# --- 2. DATA AND SPLIT ---
print("Carregando e Dividindo Dataset...")
dataset = load_dataset(
    "TucanoBR/GigaVerbo-Text-Filter",
    name="default",
    split="train",
)

# Keep the first 40k rows of the train split
full_dataset = dataset.select(range(40000))

# 90% train / 10% test, with a fixed seed so the split is repeatable
dataset_split = full_dataset.train_test_split(test_size=0.1, seed=42)
train_dataset, eval_dataset = dataset_split["train"], dataset_split["test"]

print(f"Treino: {len(train_dataset)} amostras | Teste: {len(eval_dataset)} amostras")

# --- 3. MWE (MULTI-WORD EXPRESSION) DETECTOR TRAINING ---
print("Treinando detector de MWE...")
# Lazily lowercase and whitespace-split every training text
stream = (exemplo['text'].lower().split() for exemplo in train_dataset)
phrases = Phrases(stream, min_count=10, threshold=15.0)
bigram_phraser = Phraser(phrases)

# Rank the exported phrases by score (highest first) and keep at most 20k
all_mwe = list(phrases.export_phrases().items())
all_mwe_sorted = sorted(all_mwe, key=lambda par: par[1], reverse=True)
new_tokens = []
for mwe in all_mwe_sorted[:20000]:
    frase = mwe[0]
    # Phrases may come back as bytes; normalise them to str
    if isinstance(frase, bytes):
        frase = frase.decode('utf-8')
    new_tokens.append(frase)
print(f"Vocabulário Novo: {len(new_tokens)} tokens.")

# --- 4. PRE-PROCESSING (MAP) ---
print("Aplicando MWE nos conjuntos...")

def apply_mwe_to_row(example):
    """Rewrite one row's text with the detected phrases joined.

    The 'text' field is lowercased and split on whitespace, the tokens are
    passed through `bigram_phraser`, and the result is joined back with single
    spaces and returned under the 'text' key.
    """
    palavras = example['text'].lower().split()
    return {"text": " ".join(bigram_phraser[palavras])}

# Same transformation for both splits, using 4 worker processes
train_dataset = train_dataset.map(apply_mwe_to_row, num_proc=4)
eval_dataset = eval_dataset.map(apply_mwe_to_row, num_proc=4)

# --- 5. MODEL & VOCABULARY INJECTION ---
print("Carregando Gemma 2B (A100/bf16)...")
MODEL_ID = "google/gemma-2b-it"

# 4-bit NF4 quantisation with bfloat16 as the compute dtype
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    quantization_config=bnb_config,
    device_map="auto",
)

print("Injetando Vocabulário...")
# Register the phrase tokens, then resize the embeddings to the new vocabulary size
tokenizer.add_tokens(new_tokens)
model.resize_token_embeddings(len(tokenizer))

# --- 6. LORA ---
# Prepare the quantised model for training before attaching the adapter
model = prepare_model_for_kbit_training(model)

# LoRA on the attention projections; the embedding matrix and the output head
# are listed in modules_to_save because the vocabulary was extended.
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    modules_to_save=["embed_tokens", "lm_head"],
)
model = get_peft_model(model, peft_config)
model.gradient_checkpointing_enable()

# --- 7. TRAINING ---
print("Configurando Treinador...")

# SFTConfig names its sequence-length field `max_length` in newer TRL releases
# and `max_seq_length` in older ones. Assigning `sft_config.max_seq_length`
# after construction only creates a stray attribute when that field does not
# exist, so the 512-token limit would be silently ignored. Pick the field the
# installed version declares and pass it through the constructor instead.
_seq_len_field = (
    "max_length"
    if "max_length" in SFTConfig.__dataclass_fields__
    else "max_seq_length"
)

sft_config = SFTConfig(
    output_dir="./gemma_robust_final",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,
    learning_rate=2e-4,
    logging_steps=50,
    # Evaluate and checkpoint on the same 100-step schedule so that
    # load_best_model_at_end can match checkpoints to evaluations.
    eval_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    save_steps=100,
    save_total_limit=2,
    max_steps=500,
    load_best_model_at_end=True,
    bf16=True,
    optim="paged_adamw_8bit",
    dataset_text_field="text",
    **{_seq_len_field: 512},
)

# `model` was already wrapped with get_peft_model, so `peft_config` is not
# passed again: depending on the TRL version a PeftModel plus a peft_config is
# ignored, merged and re-wrapped, or rejected.
trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    args=sft_config,
    processing_class=tokenizer,
)

print("Rodando Treino Final (Monitore o 'Eval Loss')...")
trainer.train()

# --- 8. SAVE ---
print("Salvando o Modelo Campeão...")
FINAL_PATH = "gemma-2b-robust-final"
# Write the trained model first, then the tokenizer, into the same folder
for artefato in (trainer.model, tokenizer):
    artefato.save_pretrained(FINAL_PATH)

print(f"\nEXPERIMENTO CONCLUÍDO! Modelo salvo em: {FINAL_PATH}")

In [None]:
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

print("CARREGANDO O MODELO CAMPEÃO...")

# --- PATHS ---
MODEL_ID = "google/gemma-2b-it"
# Folder holding the adapter and tokenizer to load
ADAPTER_PATH = "gemma-2b-robust-final"

# --- LOAD ---
# 1. Tokenizer stored in the adapter folder
tokenizer = AutoTokenizer.from_pretrained(ADAPTER_PATH)

# 2. Base model, quantised to 4-bit NF4 with bfloat16 compute
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID, quantization_config=bnb_config, device_map="auto"
)

# 3. Resize the embeddings to the tokenizer's size, then attach the adapter
model.resize_token_embeddings(len(tokenizer))
model = PeftModel.from_pretrained(model, ADAPTER_PATH)

# --- GENERATION TEST ---
print("\nENTREVISTANDO O MODELO...")

def testar_modelo(prompt):
    """Generate a sampled completion for `prompt` and return the decoded text.

    Args:
        prompt: Text handed to the tokenizer unchanged.

    Returns:
        The first sequence returned by `model.generate`, decoded with special
        tokens skipped.
    """
    # NOTE(review): the prompts use joined forms such as 'taxa_de_juros';
    # whether each one maps to a single added token depends on the vocabulary
    # learned during training - confirm before relying on it.
    # Send the inputs to the device the model was placed on (device_map="auto")
    # instead of hardcoding "cuda".
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=60,
            do_sample=True,
            temperature=0.6,  # Low temperature for less random sampling
            top_p=0.9
        )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Prompts written with underscore-joined expressions
prompts_finais = [
    "A taxa_de_juros elevada pode causar",
    "O governo_federal deve investir em",
    "A inteligencia_artificial mudou o mundo porque",
    "O rio_de_janeiro enfrenta problemas com",
    "Durante o fim_de_semana as pessoas gostam de",
]

# Print each prompt, then the model's answer, then a separator line
for pergunta in prompts_finais:
    print(f"{pergunta}...")
    resposta = testar_modelo(pergunta)
    print(f"{resposta}")
    print("-" * 40)