# Instalação de dependências

In [None]:
!pip install -U transformers datasets rouge-score nltk sentencepiece kaggle kagglehub evaluate --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.2/40.2 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.5/10.5 MB[0m [31m67.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import pandas as pd
import os
import torch
import numpy as np
import evaluate
import kagglehub
from datasets import Dataset
from transformers import (AutoTokenizer, AutoModelForSeq2SeqLM,
                          TrainingArguments, Trainer)

# Configurar acesso ao Kaggle e baixar o dataset

In [None]:
# Fetch the CNN/DailyMail summarization dataset through kagglehub and keep the
# location it reports for the downloaded files.
path = kagglehub.dataset_download(
    "gowrishankarp/newspaper-text-summarization-cnn-dailymail"
)

# Show where the files ended up so later cells can be checked against it.
print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/newspaper-text-summarization-cnn-dailymail


# Carregar e preparar os dados

In [None]:
# Build the CSV locations from the directory returned by kagglehub instead of
# hard-coding an absolute path, so this cell keeps working when the dataset is
# cached somewhere other than /kaggle/input.
data_dir = os.path.join(path, "cnn_dailymail")

# One DataFrame per split shipped with the dataset.
train_df = pd.read_csv(os.path.join(data_dir, "train.csv"))
val_df = pd.read_csv(os.path.join(data_dir, "validation.csv"))
test_df = pd.read_csv(os.path.join(data_dir, "test.csv"))

In [None]:
# Wrap each pandas split in a Hugging Face Dataset, in the order
# train, validation, test.
train_ds, val_ds, test_ds = (
    Dataset.from_pandas(frame) for frame in (train_df, val_df, test_df)
)

# Pré-processamento e tokenização

In [None]:
# Checkpoint to fine-tune (or "facebook/bart-base").
model_ckpt = "t5-small"

# Maximum token lengths intended for the model input and the target summary.
max_input_len = 512
max_target_len = 60

# Tokenizer matching the chosen checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

def preprocess(batch):
    """Tokenize a batch of articles and reference summaries for seq2seq training.

    Each article is prefixed with "summarize: " and tokenized as the model
    input; each entry of "highlights" is tokenized as the target. Both are
    truncated and padded to the notebook-level limits ``max_input_len`` and
    ``max_target_len``.

    Args:
        batch: Mapping with an "article" list and a "highlights" list, as
            passed by ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output for the inputs with an extra "labels" entry
        holding the target token ids, where padding positions are replaced
        by -100.
    """
    inputs = tokenizer(
        ["summarize: " + text for text in batch["article"]],
        max_length=max_input_len,
        truncation=True,
        padding="max_length",
    )
    targets = tokenizer(
        text_target=batch["highlights"],
        max_length=max_target_len,
        truncation=True,
        padding="max_length",
    )

    # -100 is the label value the Transformers loss ignores. Without this
    # replacement the padded positions of every summary would be scored as if
    # the model had to predict the pad token.
    pad_id = tokenizer.pad_token_id
    inputs["labels"] = [
        [token if token != pad_id else -100 for token in sequence]
        for sequence in targets["input_ids"]
    ]
    return inputs

# Run the tokenization over every split in batches, dropping the original
# columns so only the tokenizer outputs and labels remain.
tokenized_train, tokenized_val, tokenized_test = (
    split.map(preprocess, batched=True, remove_columns=split.column_names)
    for split in (train_ds, val_ds, test_ds)
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/287113 [00:00<?, ? examples/s]

Map:   0%|          | 0/13368 [00:00<?, ? examples/s]

Map:   0%|          | 0/11490 [00:00<?, ? examples/s]

# Treinamento (Fine-Tuning)

In [None]:
# DataCollatorForSeq2Seq is the only name used here that the notebook's import
# cell does not already provide.
from transformers import DataCollatorForSeq2Seq

# Number of examples taken from the tokenized splits for this fine-tuning run.
TRAIN_SUBSET_SIZE = 10000
EVAL_SUBSET_SIZE = 1000

model = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt)

training_args = TrainingArguments(
    output_dir="./results",
    do_train=True,
    do_eval=True,
    # do_eval on its own is not acted on by Trainer; the evaluation schedule
    # comes from eval_strategy, so ask for one evaluation pass per epoch.
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=100,
)

# Collates tokenized examples into seq2seq batches for the Trainer.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

trainer = Trainer(
    model=model,
    args=training_args,
    # min() keeps the selection valid if a split is smaller than the subset size.
    train_dataset=tokenized_train.select(
        range(min(TRAIN_SUBSET_SIZE, len(tokenized_train)))
    ),
    eval_dataset=tokenized_val.select(
        range(min(EVAL_SUBSET_SIZE, len(tokenized_val)))
    ),
    # Trainer's `tokenizer=` argument is deprecated in favour of
    # `processing_class=`.
    processing_class=tokenizer,
    data_collator=data_collator,
)

trainer.train()

  trainer = Trainer(
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss


# Avaliação

In [None]:
metric = evaluate.load("rouge")

def evaluate_predictions(num_examples=1):
    """Generate summaries for the first test examples and score them with ROUGE.

    The first example is printed (article excerpt, generated summary and
    reference summary) for a qualitative look; ROUGE is computed over all
    ``num_examples`` generated summaries.

    Args:
        num_examples: How many examples from the start of ``test_ds`` to
            summarize. Capped at the size of ``test_ds``.

    Returns:
        The dictionary of ROUGE scores returned by ``metric.compute``.

    Raises:
        ValueError: If ``num_examples`` is smaller than 1.
    """
    if num_examples < 1:
        raise ValueError("num_examples must be at least 1")

    model.eval()

    predictions = []
    references = []
    for idx in range(min(num_examples, len(test_ds))):
        example = test_ds[idx]
        article = example["article"]
        reference = example["highlights"]

        # Same prefix and length limit as used when preparing the training data.
        encoded = tokenizer("summarize: " + article,
                            return_tensors="pt",
                            truncation=True,
                            max_length=max_input_len).to(model.device)

        # Summary generation
        output = model.generate(input_ids=encoded["input_ids"],
                                attention_mask=encoded["attention_mask"],
                                max_length=max_target_len)
        generated_summary = tokenizer.decode(output[0], skip_special_tokens=True)

        predictions.append(generated_summary)
        references.append(reference)

        if idx == 0:
            print("Notícia original:\n", article[:500], "...\n")  # truncated for display
            print("Resumo gerado:\n", generated_summary)
            print("Resumo referência:\n", reference)

    # Quantitative check: compare the generated summaries with the references.
    scores = metric.compute(predictions=predictions, references=references)
    print("ROUGE:\n", scores)
    return scores

rouge_scores = evaluate_predictions()