# Tech Challenge 3 — Fine‑Tuning FLAN‑T5
**Autor:** Luís Felipe Alves — **RM: 363734**  
**Entrega individual — FIAP | Pós IA para Devs**  
**Última atualização:** 2025-10-02 04:40:31

Este notebook realiza o pipeline completo: dados → baseline → **fine‑tuning** → **ROUGE** → `responder()`.

## 0) Instalação e checagens

In [None]:
!pip -q install -U "transformers==4.44.2" "datasets==2.20.0" "sentencepiece==0.2.0" \
                 "evaluate==0.4.2" "rouge-score==0.1.2" fsspec gcsfs

import sys, torch, random, numpy as np
# Environment report: interpreter version, PyTorch build and GPU visibility.
_has_cuda = torch.cuda.is_available()
for _label, _value in (
    ("Python:", sys.version),
    ("Torch:", torch.__version__),
    ("CUDA disponível:", _has_cuda),
):
    print(_label, _value)
if _has_cuda:
    print("GPU:", torch.cuda.get_device_name(0))

# Seed every random number generator used by the notebook with the same value
# so that sampling and training runs are repeatable.
seed = 42
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(seed)
if _has_cuda:
    torch.cuda.manual_seed_all(seed)

## 1) Configuração (MODO RÁPIDO/COMPLETO)

In [None]:
from pathlib import Path
import torch

MODEL_NAME = "google/flan-t5-small"   # can be swapped for 'flan-t5-base'
MODO = "RAPIDO"   # "RAPIDO" or "COMPLETO"

# Per-mode presets: how many dataset lines to read and how many epochs to train.
_MODE_PRESETS = {
    "RAPIDO": {"SAMPLE_SIZE": 9000, "EPOCHS": 1},
    "COMPLETO": {"SAMPLE_SIZE": 30000, "EPOCHS": 2},
}
# Fail fast on an unknown mode: previously any value other than "RAPIDO"
# (including a typo) silently selected the expensive COMPLETO preset.
if MODO not in _MODE_PRESETS:
    raise ValueError(
        f"MODO inválido: {MODO!r}. Use um de: {', '.join(_MODE_PRESETS)}."
    )
SAMPLE_SIZE = _MODE_PRESETS[MODO]["SAMPLE_SIZE"]
EPOCHS = _MODE_PRESETS[MODO]["EPOCHS"]

# Optimisation hyper-parameters and tokenised sequence limits.
BATCH_SIZE = 8
GRAD_ACCUM = 4
LR = 2e-4
MAX_SOURCE_LEN = 96
MAX_TARGET_LEN = 192

# Working directories and the expected location of the raw training file.
BASE_DIR = Path("/content/drive/MyDrive/tc3_flan_t5")
RAW_DIR = BASE_DIR / "raw"
# NOTE(review): this creates directories under /content/drive. If Drive is not
# mounted yet they land on the local disk and may make a later drive.mount()
# reject the non-empty mountpoint — confirm the intended cell order.
RAW_DIR.mkdir(parents=True, exist_ok=True)
TRN_JSON = RAW_DIR / "trn.json"  # may also be a gzip-compressed file

# bf16 mixed precision is enabled only when the visible GPU reports an A100 name.
USE_BF16 = torch.cuda.is_available() and ("a100" in torch.cuda.get_device_name(0).lower())
print("MODO:", MODO, "| SAMPLE_SIZE:", SAMPLE_SIZE, "| EPOCHS:", EPOCHS, "| BF16:", USE_BF16)

## 2) Montar Drive e confirmar dados

In [None]:
import shutil
from pathlib import Path


def _clear_stale_mountpoint(mountpoint):
    """Remove an empty placeholder directory tree sitting at *mountpoint*.

    Directories created under the mountpoint before Drive is mounted leave it
    non-empty, and ``drive.mount`` refuses a mountpoint that already contains
    entries. The tree is removed only when it is a plain directory (not a
    symlink, not an active mount) that holds nothing but empty directories;
    anything containing a real file or symlink is left untouched.
    """
    root = Path(mountpoint)
    if not root.is_dir() or root.is_symlink() or root.is_mount():
        return
    if any(p.is_symlink() or not p.is_dir() for p in root.rglob("*")):
        return
    shutil.rmtree(root)


_DRIVE_MOUNTPOINT = "/content/drive"
try:
    from google.colab import drive
    # Only reached on Colab (the import above fails elsewhere).
    _clear_stale_mountpoint(_DRIVE_MOUNTPOINT)
    drive.mount(_DRIVE_MOUNTPOINT)
except Exception as e:
    print("Drive já montado ou indisponível:", e)

# (Re)create the data folder now that the mount attempt is over, so the upload
# below has a destination to move the file into.
TRN_JSON.parent.mkdir(parents=True, exist_ok=True)

print("Existe TRN_JSON?", TRN_JSON.exists())
if not TRN_JSON.exists():
    print("Faça upload de trn.json (ou trn.json.gz) para", TRN_JSON)
    try:
        from google.colab import files
        up = files.upload()
        if not up:
            # Nothing was selected: do not claim the upload succeeded.
            raise RuntimeError("nenhum arquivo foi enviado")
        for name in up.keys():
            shutil.move(name, TRN_JSON.as_posix())
        print("Upload concluído:", TRN_JSON)
    except Exception as e:
        print("Envie manualmente o arquivo para a pasta indicada.")
        # Surface the reason instead of swallowing it silently.
        print("Motivo:", e)
## 3) Carregar amostra do dataset (title → content)

In [None]:
import json, gzip, re

def is_gz(p):
    """Return True when *p* looks like a gzip file.

    The check reads the first two bytes and compares them with the gzip magic
    number. If the file cannot be opened or read, the decision falls back to
    whether the path ends in ``.gz``.

    Args:
        p: Path (``str`` or path-like) of the file to inspect.

    Returns:
        bool: True for gzip content (or, when unreadable, a ``.gz`` name).
    """
    try:
        with open(p, "rb") as fh:
            return fh.read(2) == b"\x1f\x8b"
    except OSError:
        # Only I/O failures trigger the name-based fallback; a bare ``except``
        # would also swallow KeyboardInterrupt and SystemExit.
        return str(p).endswith(".gz")

def iter_lines(p):
    """Yield the non-blank lines of *p*, stripped, without trailing commas.

    The file is read as UTF-8 text (undecodable bytes are ignored) and is
    opened through ``gzip`` when ``is_gz(p)`` says it is compressed. Each
    yielded line has surrounding whitespace removed and any trailing ``,``
    characters dropped.
    """
    compressed = is_gz(p)
    if compressed:
        handle = gzip.open(p, "rt", encoding="utf-8", errors="ignore")
    else:
        handle = open(p, "r", encoding="utf-8", errors="ignore")
    with handle:
        for raw in handle:
            stripped = raw.strip()
            if not stripped:
                continue
            yield stripped.rstrip(",")

def norm(s):
    """Trim *s* and collapse every whitespace run into a single space.

    Falsy input (``None`` or ``""``) yields an empty string.
    """
    text = s if s else ""
    trimmed = text.strip()
    return re.sub(r"\s+", " ", trimmed)

from itertools import islice

# Read at most SAMPLE_SIZE lines of the raw file and keep every record that
# has both a non-empty title and a non-empty content.
pairs = []
_skipped = 0  # lines that were not usable JSON objects with text fields
for line in islice(iter_lines(TRN_JSON), SAMPLE_SIZE):
    try:
        obj = json.loads(line)
    except json.JSONDecodeError:
        # e.g. the "[" / "]" lines of a JSON array written one item per line
        _skipped += 1
        continue
    if not isinstance(obj, dict):
        _skipped += 1
        continue
    t, c = obj.get("title", ""), obj.get("content", "")
    # None is treated as empty by norm(); any other non-string cannot be
    # normalised and is counted as a skipped line.
    if not all(v is None or isinstance(v, str) for v in (t, c)):
        _skipped += 1
        continue
    t, c = norm(t), norm(c)
    if t and c:
        pairs.append({"title": t, "content": c})

print("Amostra carregada:", len(pairs))
if _skipped:
    # Report instead of hiding parse problems behind a bare ``except: pass``.
    print("Linhas ignoradas (JSON inválido ou campos não textuais):", _skipped)
print("Exemplo:", pairs[0] if pairs else "sem dados")

## 4) Tokenização (Seq2Seq)

In [None]:
from transformers import AutoTokenizer
from datasets import Dataset

# Load the tokenizer that belongs to the MODEL_NAME checkpoint; it is shared by
# preprocessing, the baseline, training and generation below.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def make_input(title):
    """Build the instruction prompt for one product *title*.

    The prompt is an instruction line followed by a ``Title: <title>`` line.
    """
    template = "Generate a concise product description.\nTitle: {title}"
    return template.format(title=title)

# Turn every (title, content) pair into a prompt/target record, then hold out
# 6% of the records for validation using a fixed split seed.
_records = []
for _pair in pairs:
    _records.append({"input": make_input(_pair["title"]), "target": _pair["content"]})
raw_ds = Dataset.from_list(_records).train_test_split(test_size=0.06, seed=42)
train_raw = raw_ds["train"]
val_raw = raw_ds["test"]

def preprocess(examples):
    """Tokenise a batch of prompt/target records for seq2seq training.

    Args:
        examples: Batch mapping with the ``"input"`` and ``"target"`` columns
            (lists of strings, as produced by ``Dataset.map(batched=True)``).

    Returns:
        The tokenised inputs, truncated to ``MAX_SOURCE_LEN``, with an extra
        ``"labels"`` entry holding the target token ids truncated to
        ``MAX_TARGET_LEN``.
    """
    model_inputs = tokenizer(examples["input"], max_length=MAX_SOURCE_LEN, truncation=True)
    # ``text_target=`` replaces the deprecated ``as_target_tokenizer()`` context
    # manager; a separate call keeps the independent target length limit.
    labels = tokenizer(text_target=examples["target"], max_length=MAX_TARGET_LEN, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

def _tokenize_split(split):
    """Tokenise one dataset split, dropping its original text columns."""
    return split.map(preprocess, batched=True, remove_columns=split.column_names)


train_ds = _tokenize_split(train_raw)
val_ds = _tokenize_split(val_raw)
print(train_ds, "\n", val_ds)

## 5) Baseline (modelo sem ajuste)

In [None]:
from transformers import AutoModelForSeq2SeqLM
import torch

# Use the GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Untouched checkpoint used as the "before fine-tuning" reference; it is only
# used for generation, so it is switched to eval mode.
base_model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
base_model = base_model.to(device)
base_model = base_model.eval()

def gerar_base(titles, max_new_tokens=64):
    """Generate descriptions for *titles* with the non-fine-tuned base model.

    Args:
        titles: Iterable of product titles; each is wrapped by ``make_input``.
        max_new_tokens: Upper bound on the tokens generated per title.

    Returns:
        list[str]: One decoded description per title, in input order. An empty
        input yields an empty list.
    """
    titles = list(titles)  # accept any iterable, including generators
    if not titles:
        # Nothing to tokenise or generate from; avoid failing on an empty batch.
        return []
    inp = tokenizer([make_input(t) for t in titles], return_tensors="pt",
                    padding=True, truncation=True).to(device)
    with torch.no_grad():  # inference only, no gradients needed
        out = base_model.generate(**inp, max_new_tokens=max_new_tokens)
    return tokenizer.batch_decode(out, skip_special_tokens=True)

# Keep the first three titles as a fixed demo set and show what the untouched
# model produces for the first of them.
sample_titles = []
for _pair in pairs[:3]:
    sample_titles.append(_pair["title"])
_baseline_outputs = gerar_base(sample_titles)
print("Baseline sample:", _baseline_outputs[:1])

## 6) Fine‑tuning (Seq2SeqTrainer)

In [None]:
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer, AutoModelForSeq2SeqLM

# Single source of truth for where checkpoints and the final model are written.
_OUTPUT_DIR = str(BASE_DIR/"models"/"flan_t5_tc3")

# Fresh copy of the checkpoint to fine-tune (the baseline model stays untouched).
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME).to(device)

# The collator expects the model *object* (not the checkpoint name) so it can
# use the model's helper to build decoder inputs from the padded labels.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

args = Seq2SeqTrainingArguments(
    output_dir=_OUTPUT_DIR,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRAD_ACCUM,
    learning_rate=LR,
    num_train_epochs=EPOCHS,
    logging_steps=50,
    # ``eval_strategy`` is the current name of the deprecated
    # ``evaluation_strategy`` argument in the pinned transformers release.
    eval_strategy="epoch",
    save_strategy="epoch",
    predict_with_generate=True,
    bf16=USE_BF16,
    # fp16 mixed precision needs a GPU; ``not USE_BF16`` alone switched it on
    # for CPU-only runs as well.
    # NOTE(review): T5-family models are reported to be numerically fragile in
    # fp16 — if the loss becomes NaN/0.0, set fp16=False and train in fp32.
    fp16=(device == "cuda" and not USE_BF16),
    report_to="none",
)

trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()
# Persist model and tokenizer side by side so they can be reloaded together.
trainer.save_model(_OUTPUT_DIR)
tokenizer.save_pretrained(_OUTPUT_DIR)

## 7) Avaliação — ROUGE

In [None]:
import evaluate, numpy as np

rouge = evaluate.load("rouge")

# Generate predictions for the validation split with the fine-tuned model.
pred = trainer.predict(val_ds, max_new_tokens=MAX_TARGET_LEN)
pred_ids = pred.predictions
if isinstance(pred_ids, tuple):
    # Some configurations return extra outputs; keep the token ids.
    pred_ids = pred_ids[0]
# Like the labels below, predictions may carry -100 as a padding marker, which
# the tokenizer cannot decode; map it to the real pad token first.
pred_ids = np.where(pred_ids != -100, pred_ids, tokenizer.pad_token_id)
preds = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)

# Labels use -100 for ignored positions; restore the pad token before decoding.
labels = pred.label_ids
labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
refs = tokenizer.batch_decode(labels, skip_special_tokens=True)

results = rouge.compute(predictions=preds, references=refs, use_stemmer=True)
print({k: round(v, 4) for k, v in results.items()})

## 8) Função `responder()` e exemplos

In [None]:
# Switch the fine-tuned model to inference mode before generating.
model.eval()


def responder(title: str, max_new_tokens: int = 128) -> str:
    """Generate a product description for *title* with the fine-tuned model.

    Args:
        title: Product title, wrapped into a prompt by ``make_input``.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded text of the first generated sequence, special tokens
        removed.
    """
    prompt = make_input(title)
    encoded = tokenizer([prompt], return_tensors="pt", padding=True, truncation=True)
    encoded = encoded.to(model.device)
    # Beam search with 4 beams and no_repeat_ngram_size=3.
    generation_options = dict(
        max_new_tokens=max_new_tokens,
        num_beams=4,
        no_repeat_ngram_size=3,
    )
    with torch.no_grad():
        sequences = model.generate(**encoded, **generation_options)
    first_sequence = sequences[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

# Side-by-side comparison for each demo title: base model output ("ANTES")
# versus fine-tuned model output ("DEPOIS"), both cut to 300 characters.
_PREVIEW_CHARS = 300
for idx, title in enumerate(sample_titles, start=1):
    print(f"\n#{idx} TITLE:", title)
    before = gerar_base([title])[0]
    print("ANTES :", before[:_PREVIEW_CHARS])
    after = responder(title)
    print("DEPOIS:", after[:_PREVIEW_CHARS])

## 9) Carregar modelo salvo (para gravar sem retreinar)

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Reload the tokenizer and model saved under the fine-tuning output folder so
# that generation works without running training again.
LOAD_DIR = str(BASE_DIR.joinpath("models", "flan_t5_tc3"))
tok = AutoTokenizer.from_pretrained(LOAD_DIR)
mdl = AutoModelForSeq2SeqLM.from_pretrained(LOAD_DIR)
mdl = mdl.to(device)
mdl = mdl.eval()

def responder_carregado(title: str, max_new_tokens: int = 128) -> str:
    """Generate a product description for *title* with the reloaded model.

    Uses the ``tok`` / ``mdl`` pair loaded from disk, with the same prompt and
    generation settings as the in-memory fine-tuned model.

    Args:
        title: Product title, wrapped into a prompt by ``make_input``.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded text of the first generated sequence, special tokens
        removed.
    """
    batch = tok(
        [make_input(title)],
        return_tensors="pt",
        padding=True,
        truncation=True,
    )
    batch = batch.to(mdl.device)
    with torch.no_grad():
        generated = mdl.generate(
            **batch,
            max_new_tokens=max_new_tokens,
            num_beams=4,
            no_repeat_ngram_size=3,
        )
    top = generated[0]
    return tok.decode(top, skip_special_tokens=True)

# Smoke test of the reloaded model: first 300 characters generated for the
# first demo title.
print(responder_carregado(sample_titles[0])[:300])