### Install dependencies

In [None]:
!pip install transformers datasets torch
!pip install rouge_score
!pip install evaluate

### Load dataset

In [None]:
from datasets import load_dataset


# Fetch configuration "3.0.0" of the CNN/DailyMail corpus; splits are looked up by name below.
dataset = load_dataset(path="cnn_dailymail", name="3.0.0")

# Show the first train record and the first validation record as the cell output.
(dataset["train"][0], dataset["validation"][0])

### Preprocess data

In [None]:
# Keep only the first 1,000 training records and the first 100 validation
# records so the first experiment runs quickly.
train_subset, valid_subset = (
    dataset[split_name].select(range(n_records))
    for split_name, n_records in (("train", 1000), ("validation", 100))
)

In [None]:
from transformers import T5Tokenizer

# The tokenizer has to exist before `tokenize_function` is mapped over the data
# and before it is handed to the Trainer; without this line a fresh kernel
# fails with NameError.
tokenizer = T5Tokenizer.from_pretrained("t5-small")


def tokenize_function(examples):
    """Tokenize a batch of articles together with their reference summaries.

    Args:
        examples: Batch holding "article" and "highlights" entries, as passed
            by `Dataset.map(..., batched=True)`.

    Returns:
        The tokenized articles (padded/truncated to 512 tokens) with an extra
        "labels" entry holding the tokenized highlights (padded/truncated to
        150 tokens). Padding positions inside the labels are replaced by -100
        so that they are ignored by the loss.
    """
    model_inputs = tokenizer(examples['article'], padding="max_length", truncation=True, max_length=512)

    # `text_target=` replaces the deprecated `as_target_tokenizer()` context manager.
    labels = tokenizer(text_target=examples['highlights'], padding="max_length", truncation=True, max_length=150)

    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in label_ids]
        for label_ids in labels["input_ids"]
    ]
    return model_inputs


tokenized_train_dataset= train_subset.map(tokenize_function, batched=True)
tokenized_validation_dataset= valid_subset.map(tokenize_function, batched=True)

### Load model

In [None]:
from transformers import T5ForConditionalGeneration


# Start from the pretrained t5-small checkpoint.
model = T5ForConditionalGeneration.from_pretrained("t5-small")

# Freeze the first five encoder blocks: none of their parameters will receive
# gradient updates during fine-tuning.
for frozen_block in model.encoder.block[:5]:
    frozen_block.requires_grad_(False)


In [None]:
from transformers import Trainer, TrainingArguments


import torch

# Assigning `model.config.dropout_rate` on an already-built model does not
# change the dropout layers that were created at construction time, so the
# rate is also written into the live modules.
DROPOUT_RATE = 0.3
model.config.dropout_rate = DROPOUT_RATE
for module in model.modules():
    if isinstance(module, torch.nn.Dropout):
        module.p = DROPOUT_RATE
    elif isinstance(getattr(module, "dropout", None), float):
        # Some layers keep the rate as a plain float attribute instead of an nn.Dropout.
        module.dropout = DROPOUT_RATE


# This run uses 1,000 training examples with an effective batch of 32
# (16 per device x 2 accumulation steps), i.e. roughly 95 optimizer steps over
# 3 epochs on one device. Step-based intervals therefore have to be far below
# that, otherwise no evaluation, checkpoint or log entry is ever produced and
# the warmup never finishes.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` - confirm against the installed version.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="steps",
    eval_steps=20,
    save_steps=20,  # kept equal to eval_steps, as load_best_model_at_end requires
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    gradient_accumulation_steps=2,
    fp16=torch.cuda.is_available(),  # mixed precision needs a CUDA device
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True,
    lr_scheduler_type="linear",
    warmup_ratio=0.1,  # scales with the run length instead of a fixed 1000 steps
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_validation_dataset,
    tokenizer=tokenizer
)


### Train the model on 1,000 examples

In [None]:
# Run fine-tuning with `trainer`; the value returned by `train()` is shown as the cell output.
trainer.train()


### Evaluate on the validation subset

In [None]:
# Evaluate `trainer`'s model on the evaluation dataset it was constructed with.
results = trainer.evaluate()

# Print the returned metrics.
print(results)

### Predict results on test data

In [None]:
import torch

# Prefer the GPU when PyTorch can see one; otherwise stay on the CPU.
device_0 = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Move the model to that device; `.to` returns the model, which becomes the cell output.
model.to(device=device_0)

In [None]:
import evaluate


# Load the "rouge" metric; `evaluate_model_on_test` below reads this module-level name.
rouge = evaluate.load("rouge")

def evaluate_model_on_test(model, dataset, tokenizer):
    """Summarize every article in `dataset` and score the summaries with ROUGE.

    Args:
        model: Model exposing `.device`, `.eval()` and `.generate(...)`.
        dataset: Iterable of records with "article" and "highlights" entries.
        tokenizer: Tokenizer used to encode the articles and decode the
            generated ids.

    Returns:
        Whatever `rouge.compute` returns for the generated summaries against
        the reference highlights.
    """
    predictions = []
    references = []

    device = model.device
    # Make sure dropout is disabled while generating, whatever mode the caller left the model in.
    model.eval()

    for example in dataset:
        # Articles are encoded one at a time, so padding to a fixed length only
        # adds wasted positions; truncation to 512 tokens is kept.
        inputs = tokenizer(example["article"], return_tensors="pt", truncation=True, max_length=512)
        inputs = {key: value.to(device) for key, value in inputs.items()}

        # Forward the attention mask together with the ids instead of dropping it.
        summary_ids = model.generate(**inputs, max_length=150, num_beams=4, early_stopping=True)

        predictions.append(tokenizer.decode(summary_ids[0], skip_special_tokens=True))
        references.append(example["highlights"])

    return rouge.compute(predictions=predictions, references=references)

# Score `model` on the first 100 records of the test split.
first_test_records = dataset["test"].select(range(100))
test_results = evaluate_model_on_test(model, first_test_records, tokenizer)

# One "metric: value" line per entry of the ROUGE result.
for metric_name, metric_value in test_results.items():
    print(f"{metric_name}: {metric_value}")


In [None]:
from transformers import T5ForConditionalGeneration

# A fresh copy of the pretrained checkpoint, loaded separately from `model`, to serve as the baseline.
model_untrained = T5ForConditionalGeneration.from_pretrained("t5-small")

# The article of the first validation record is the sample text for the comparison.
first_validation_record = dataset["validation"][0]
article = first_validation_record["article"]

def generate_summary(model, text):
    """Generate a summary of `text` with `model` using beam search.

    Args:
        model: Model exposing `.device`, `.eval()` and `.generate(...)`.
        text: The article to summarize.

    Returns:
        The decoded summary string with special tokens removed.
    """
    # Make sure dropout is disabled while generating.
    model.eval()

    # A single text needs no padding; it is only truncated to 512 tokens.
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    inputs = inputs.to(model.device)

    # Forward the attention mask together with the ids instead of dropping it.
    summary_ids = model.generate(**inputs, max_length=150, num_beams=4, early_stopping=True)
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Summarize the same article with the fine-tuned model and with the baseline.
summary_model = generate_summary(model, article)
summary_model_untrained = generate_summary(model_untrained, article)

# Print both summaries under their headings, baseline first.
print(
    "Untrained model Summary:",
    summary_model_untrained,
    "\nTrained model Summary:",
    summary_model,
    sep="\n",
)

### Train model on 100,000 examples

### Select data

In [None]:
# Second experiment: the first 100,000 training records and the first 10,000
# validation records.
train_subset_2, valid_subset_2 = (
    dataset[split_name].select(range(n_records))
    for split_name, n_records in (("train", 100000), ("validation", 10000))
)

In [None]:
from transformers import T5Tokenizer

tokenizer = T5Tokenizer.from_pretrained("t5-small")

def tokenize_function(examples):
    """Tokenize a batch of articles together with their reference summaries.

    Args:
        examples: Batch holding "article" and "highlights" entries, as passed
            by `Dataset.map(..., batched=True)`.

    Returns:
        The tokenized articles (padded/truncated to 512 tokens) with an extra
        "labels" entry holding the tokenized highlights (padded/truncated to
        150 tokens). Padding positions inside the labels are replaced by -100
        so that they are ignored by the loss.
    """
    model_inputs = tokenizer(examples['article'], padding="max_length", truncation=True, max_length=512)

    # `text_target=` replaces the deprecated `as_target_tokenizer()` context manager.
    labels = tokenizer(text_target=examples['highlights'], padding="max_length", truncation=True, max_length=150)

    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in label_ids]
        for label_ids in labels["input_ids"]
    ]
    return model_inputs


tokenized_train_dataset_2= train_subset_2.map(tokenize_function, batched=True)
tokenized_validation_dataset_2= valid_subset_2.map(tokenize_function, batched=True)


### Load model

In [None]:
from transformers import T5ForConditionalGeneration

# A second, independent copy of the pretrained t5-small checkpoint for the larger run.
model_2 = T5ForConditionalGeneration.from_pretrained("t5-small")

# Freeze the first five encoder blocks: none of their parameters will receive
# gradient updates during fine-tuning.
for frozen_block in model_2.encoder.block[:5]:
    frozen_block.requires_grad_(False)

In [None]:
from transformers import Trainer, TrainingArguments

import torch

# Assigning `model_2.config.dropout_rate` on an already-built model does not
# change the dropout layers that were created at construction time, so the
# rate is also written into the live modules.
DROPOUT_RATE = 0.3
model_2.config.dropout_rate = DROPOUT_RATE
for module in model_2.modules():
    if isinstance(module, torch.nn.Dropout):
        module.p = DROPOUT_RATE
    elif isinstance(getattr(module, "dropout", None), float):
        # Some layers keep the rate as a plain float attribute instead of an nn.Dropout.
        module.dropout = DROPOUT_RATE

# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` - confirm against the installed version.
training_args_2 = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="steps",
    eval_steps=2000,
    save_steps=2000,  # kept equal to eval_steps, as load_best_model_at_end requires
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    gradient_accumulation_steps=2,
    fp16=torch.cuda.is_available(),  # mixed precision needs a CUDA device
    logging_dir="./logs",
    logging_steps=100,
    load_best_model_at_end=True,
    lr_scheduler_type="linear",
    warmup_steps=1000,
)

trainer_2 = Trainer(
    model=model_2,
    args=training_args_2,
    train_dataset=tokenized_train_dataset_2,
    eval_dataset=tokenized_validation_dataset_2,
    tokenizer=tokenizer
)


### Train model

In [None]:
# Run fine-tuning with `trainer_2`; the value returned by `train()` is shown as the cell output.
trainer_2.train()

### Evaluate on the validation subset

In [None]:
# Evaluate `trainer_2`'s model on the evaluation dataset it was constructed with.
# Note that this rebinds `results`, replacing the metrics of the earlier run.
results = trainer_2.evaluate()

# Print the returned metrics.
print(results)

### Check performance on test data

In [None]:
import torch

# Prefer the GPU when PyTorch can see one; otherwise stay on the CPU.
device_0 = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Move the second model to that device; `.to` returns the model, which becomes the cell output.
model_2.to(device=device_0)

In [None]:
import torch
import evaluate

# Load the "rouge" metric; `evaluate_model_on_test` below reads this module-level name.
rouge = evaluate.load("rouge")

def evaluate_model_on_test(model, dataset, tokenizer):
    """Summarize every article in `dataset` and score the summaries with ROUGE.

    Args:
        model: Model exposing `.device`, `.eval()` and `.generate(...)`.
        dataset: Iterable of records with "article" and "highlights" entries.
        tokenizer: Tokenizer used to encode the articles and decode the
            generated ids.

    Returns:
        Whatever `rouge.compute` returns for the generated summaries against
        the reference highlights.
    """
    predictions = []
    references = []

    device = model.device
    # Make sure dropout is disabled while generating, whatever mode the caller left the model in.
    model.eval()

    for example in dataset:
        # Articles are encoded one at a time, so padding to a fixed length only
        # adds wasted positions; truncation to 512 tokens is kept.
        inputs = tokenizer(example["article"], return_tensors="pt", truncation=True, max_length=512)
        inputs = {key: value.to(device) for key, value in inputs.items()}

        # Forward the attention mask together with the ids instead of dropping it.
        summary_ids = model.generate(**inputs, max_length=150, num_beams=4, early_stopping=True)

        predictions.append(tokenizer.decode(summary_ids[0], skip_special_tokens=True))
        references.append(example["highlights"])

    return rouge.compute(predictions=predictions, references=references)

# Score `model_2` on the first 100 records of the test split.
first_test_records = dataset["test"].select(range(100))
test_results = evaluate_model_on_test(model_2, first_test_records, tokenizer)

# One "metric: value" line per entry of the ROUGE result.
for metric_name, metric_value in test_results.items():
    print(f"{metric_name}: {metric_value}")

### Model performance comparison

In [None]:
def generate_summary(model, text):
    """Generate a summary of `text` with the given `model` using beam search.

    The model is an explicit parameter so that the same generation settings
    are applied to every model being compared.

    Args:
        model: Model exposing `.device`, `.eval()` and `.generate(...)`.
        text: The article to summarize.

    Returns:
        The decoded summary string with special tokens removed.
    """
    # Make sure dropout is disabled while generating.
    model.eval()

    # A single text needs no padding; it is only truncated to 512 tokens.
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)

    # Move the inputs to the device of the model actually being used.
    inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}

    # Forward the attention mask together with the ids instead of dropping it.
    summary_ids = model.generate(**inputs, max_length=350, num_beams=8, early_stopping=True)
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)


model_untrained = T5ForConditionalGeneration.from_pretrained("t5-small")

article = dataset["validation"][0]["article"]

summary_model_untrained = generate_summary(model_untrained, article)
# `model` is the network fine-tuned on 1,000 examples; `model_2` the one fine-tuned on 100,000.
summary_model_trained_1000 = generate_summary(model, article)
summary_model_trained_100000 = generate_summary(model_2, article)

print("Untrained model Summary:")
print(summary_model_untrained)
print("\nTrained model on 1,000 examples:")
print(summary_model_trained_1000)
print("\nTrained model on 100,000 examples:")
print(summary_model_trained_100000)

### Save model

In [None]:
import os

# Destination folder for the saved model and tokenizer files; created if missing.
output_dir = "./t5_finetuned_model_2"
os.makedirs(name=output_dir, exist_ok=True)

# Write `model_2` first, then the tokenizer, into the same folder. The
# tokenizer call stays last so its return value is the cell output.
model_2.save_pretrained(save_directory=output_dir)
tokenizer.save_pretrained(save_directory=output_dir)

In [None]:
import shutil

# Zip the contents of `output_dir`; the archive path returned by `make_archive` is the cell output.
shutil.make_archive(
    base_name="./t5_finetuned_model_2",
    format="zip",
    root_dir=output_dir,
)