# Instalação de dependências

In [None]:
!pip install -U transformers datasets rouge-score nltk sentencepiece kaggle kagglehub evaluate torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121 --quiet

In [None]:

import pandas as pd
import os
import torch
import numpy as np
import evaluate
import kagglehub
from datasets import Dataset, load_from_disk, load_dataset
from transformers import (AutoTokenizer, AutoModelForSeq2SeqLM,
                          TrainingArguments, Trainer, DataCollatorForSeq2Seq)

In [None]:
# Pick the CUDA device when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Usando:", device)

Usando: cuda


# Baixar o dataset do Kaggle

In [None]:
# Download the CNN/DailyMail summarization dataset via kagglehub and keep the
# local directory it returns in `path`.
KAGGLE_DATASET = "gowrishankarp/newspaper-text-summarization-cnn-dailymail"
path = kagglehub.dataset_download(KAGGLE_DATASET)

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/newspaper-text-summarization-cnn-dailymail


# Carregar e preparar os dados

In [None]:
# Build the CSV locations from the directory kagglehub returned (`path`) instead of a
# hard-coded /kaggle/input/... prefix, so the cell follows wherever the download landed.
data_dir = os.path.join(path, "cnn_dailymail")
dataset = load_dataset("csv", data_files={
    split: os.path.join(data_dir, f"{split}.csv")
    for split in ("train", "validation", "test")
})

# Keep only the first rows of each split so the experiment stays small.
train_ds = dataset["train"].select(range(5000))
val_ds   = dataset["validation"].select(range(500))
test_ds  = dataset["test"].select(range(500))

# Pré-processamento e tokenização

In [None]:
# Length limits for the model input and for the target summary.
max_input_len = 512
max_target_len = 80

# Checkpoint to fine-tune (or "facebook/bart-base") and its matching tokenizer.
model_ckpt = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

def preprocess(batch):
    """Tokenize a batch of articles and reference summaries for seq2seq training.

    Args:
        batch: mapping with an "article" list (source texts) and a "highlights"
            list (reference summaries) of the same length.

    Returns:
        The tokenizer output for the "summarize: "-prefixed articles, with an
        extra "labels" entry holding the token ids of the summaries.
    """
    # Sequences are truncated to the configured limits but NOT padded here: the
    # DataCollatorForSeq2Seq used for training pads each batch on the fly and fills
    # label padding with its ignore index. Labels pre-padded to max_length would
    # carry pad-token ids that are counted in the loss.
    model_inputs = tokenizer(
        ["summarize: " + text for text in batch["article"]],
        max_length=max_input_len,
        truncation=True,
    )
    targets = tokenizer(
        batch["highlights"],
        max_length=max_target_len,
        truncation=True,
    )
    model_inputs["labels"] = targets["input_ids"]
    return model_inputs

In [None]:
def _tokenize_split(split):
    """Apply `preprocess` to a split in batches, dropping the split's original columns."""
    return split.map(preprocess, batched=True, remove_columns=split.column_names)


tokenized_train = _tokenize_split(train_ds)
tokenized_val = _tokenize_split(val_ds)

# Persist both tokenized splits to disk under the folder names given here.
for folder, tokenized in (
    ("tokenized_train", tokenized_train),
    ("tokenized_val", tokenized_val),
):
    tokenized.save_to_disk(folder)

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/5000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/500 [00:00<?, ? examples/s]

# Treinamento (Fine-Tuning)

In [None]:
# Hyper-parameters for a short fine-tuning run; nothing is checkpointed or reported.
training_args = TrainingArguments(
    output_dir="./results",
    do_train=True,
    do_eval=True,
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    # max_steps alone sets the training length; it overrides num_train_epochs,
    # so no epoch count is given.
    max_steps=200,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,
    # Mixed precision needs a CUDA device; stay in full precision on CPU-only
    # machines, which the notebook's device selection explicitly allows for.
    fp16=torch.cuda.is_available(),
    save_strategy="no",
    report_to="none",
)

# Load the pretrained seq2seq model and a collator that pads every batch.
model = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt)
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# Train and evaluate on subsets of the tokenized data to keep the run short.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train.select(range(2000)),
    eval_dataset=tokenized_val.select(range(200)),
    # `processing_class` replaces the deprecated `tokenizer` argument of Trainer.
    processing_class=tokenizer,
    data_collator=data_collator,
)

trainer.train()

  trainer = Trainer(


Step,Training Loss
50,4.4248
100,3.211
150,2.545
200,2.2886


TrainOutput(global_step=200, training_loss=3.117370491027832, metrics={'train_runtime': 38.8696, 'train_samples_per_second': 20.582, 'train_steps_per_second': 5.145, 'total_flos': 108273441177600.0, 'train_loss': 3.117370491027832, 'epoch': 0.4})

# Avaliação

In [None]:
import random

# Load the ROUGE metric through the `evaluate` library; it is used to score generated summaries.
rouge = evaluate.load("rouge")

def evaluate_random_prediction(index=None, max_length=None):
    """Summarize one test article and print it with its reference and ROUGE scores.

    Args:
        index: position of the example in ``test_ds``. When None (the default) a
            position is drawn at random.
        max_length: ``max_length`` passed to ``model.generate``. When None (the
            default) ``max_target_len`` is used, i.e. the same limit the training
            targets were truncated to.

    Returns:
        None. The article excerpt, both summaries and the scores are printed.
    """
    if index is None:
        index = random.randrange(len(test_ds))
    if max_length is None:
        max_length = max_target_len

    example = test_ds[index]
    article = example["article"]
    reference = example["highlights"]

    # Training leaves the model in train mode; switch to eval mode so dropout is
    # disabled while generating.
    model.eval()

    input_ids = tokenizer(
        "summarize: " + article,
        return_tensors="pt",
        truncation=True,
        max_length=max_input_len,
    ).input_ids.to(model.device)

    output = model.generate(input_ids, max_length=max_length)
    generated_summary = tokenizer.decode(output[0], skip_special_tokens=True)

    # Score the single generated summary against its reference.
    scores = rouge.compute(predictions=[generated_summary], references=[reference])

    print(f"\nNotícia aleatória (#{index}):\n", article[:300], "...\n")
    print("Resumo gerado:\n", generated_summary)
    print("Resumo referência:\n", reference)
    print("\n ROUGE scores:")
    for key, value in scores.items():
        print(f"{key}: {value:.4f}")

# Run the check once: prints a randomly chosen test article with its generated summary,
# reference summary and ROUGE scores.
evaluate_random_prediction()


Notícia aleatória (#12):
 England captain Alastair Cook completed a much-needed century on the second morning of England's opening tour match in the West Indies. Cook resumed on 95 and reached three figures with minimal fuss before retiring out. England captain Alastair Cook completed a century on the second morning of Engla ...

Resumo gerado:
 Cook resumed on 95 and reached three figures with minimal fuss. he punched the ball for two off the back foot and gave a gentle wave of the bat. Ian Bell arrived at the crease, with batting time more important to tourists than attempting to force 
Resumo referência:
 Alastair Cook completed his century on the second morning of action .
England captain resumed on 95 and reached three figures before retiring .
That allowed Ian Bell to arrive at the crease as tourists continued to bat .

 ROUGE scores:
rouge1: 0.4634
rouge2: 0.2250
rougeL: 0.3659
rougeLsum: 0.4146
