# Jupyter Notebook for training model

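Note: use a small `learning_rate` when continuing training so that weight updates stay small and the model does not forget what it has already learned (catastrophic forgetting).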

In [None]:
# load model
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Folder that holds both the model weights and the tokenizer files
model_folder = "./t5_small_finetuned"

# Tokenizer and model are loaded independently from the same folder
tokenizer = AutoTokenizer.from_pretrained(model_folder)
model = AutoModelForSeq2SeqLM.from_pretrained(model_folder)

In [2]:
# load dataset
from datasets import load_from_disk

# Local directory containing the dataset saved on disk
dataset_path = "cnn_dailymail_full"
dataset = load_from_disk(dataset_path)

In [10]:
# take subsets of dataset
def _sample_split(split_name, num_examples, seed=42):
    """Return the first `num_examples` rows of `dataset[split_name]` after a seeded shuffle."""
    shuffled = dataset[split_name].shuffle(seed=seed)
    return shuffled.select(range(num_examples))


train_dataset = _sample_split("train", 8000)
val_dataset = _sample_split("validation", 1000)
test_dataset = _sample_split("test", 1000)

# preprocess datasets
# token limits used for truncation/padding of articles and summaries
max_input_length, max_target_length = 512, 128


def preprocess(batch):
    """Tokenize a batch of examples into model inputs and training labels.

    Args:
        batch: Mapping with an "article" list and a "highlights" list of
            strings (the batched form passed by `Dataset.map(batched=True)`).

    Returns:
        The tokenizer output for the "summarize: "-prefixed articles
        (truncated/padded to `max_input_length`), extended with a "labels"
        entry holding the tokenized highlights (truncated/padded to
        `max_target_length`). Padding positions in the labels are replaced
        by -100 so that the loss ignores them.
    """
    inputs = ["summarize: " + article for article in batch["article"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True, padding="max_length")

    labels = tokenizer(batch["highlights"], max_length=max_target_length, truncation=True, padding="max_length")

    # Labels are padded to a fixed length; without masking, the model would be
    # trained (and scored) on predicting pad tokens. -100 is the label value
    # that the Hugging Face seq2seq loss skips.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in label_ids]
        for label_ids in labels["input_ids"]
    ]
    return model_inputs


# Tokenize every split with the same batched preprocessing (train, then validation, then test)
train_dataset, val_dataset, test_dataset = (
    split.map(preprocess, batched=True) for split in (train_dataset, val_dataset, test_dataset)
)

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [None]:
# train model
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

# Training configuration. Checkpoints are written to `output_dir`; only the
# newest one is kept (save_total_limit=1).
training_args = Seq2SeqTrainingArguments(
    output_dir="./t5_small_finetuned_training",  # save model in dir
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    predict_with_generate=True,
    logging_steps=100,
    save_steps=100,
    # eval_steps only takes effect when the evaluation strategy is "steps";
    # with the default strategy ("no") the eval_dataset is never used.
    eval_strategy="steps",
    # Evaluation generates summaries for the whole validation subset, so it is
    # run sparingly to keep it from dominating training time.
    eval_steps=1000,
    num_train_epochs=3,
    learning_rate=5e-5,
    weight_decay=0.01,
    save_total_limit=1,
)

# The tokenizer is passed as processing_class so it is saved with each checkpoint.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    processing_class=tokenizer,
)
trainer.train()

In [None]:
# continue training from last checkpoint if interrupted
# resume_from_checkpoint=True makes the Trainer load the most recent checkpoint
# found in its output_dir. A hard-coded "checkpoint-<step>" path is fragile here
# because save_total_limit=1 deletes every checkpoint except the newest one.
trainer.train(resume_from_checkpoint=True)

In [7]:
# load model from checkpoint (loading lines are left disabled; enable them to inspect a checkpoint)
checkpoint_folder = "./t5_efficient_mini_finetuned_training/checkpoint-100"
# model_test = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_folder)
# tokenizer_test = AutoTokenizer.from_pretrained(checkpoint_folder)

# save final model (which can then be loaded as in tutorial) (adapt before execution!)
final_model_dir = "./t5_small_finetuned"
model.save_pretrained(final_model_dir)
# kept as the last expression so the cell displays the written tokenizer files
tokenizer.save_pretrained(final_model_dir)

('./t5_small_finetuned\\tokenizer_config.json',
 './t5_small_finetuned\\special_tokens_map.json',
 './t5_small_finetuned\\tokenizer.json')

# Tutorials

In [None]:
# get output of model
# Free-text sample input. To summarize it instead of a dataset example, assign
# it to `article` below and skip the reference print (it has no reference summary).
custom_article = """The mayor of the city announced new environmental policies today aimed at reducing emissions 
    by 40% by 2035. The initiative includes expanded public transit, incentives for electric vehicles, and stricter 
    regulations on industrial polluters."""

# Take article and reference from the same test example so they always match.
example = test_dataset[2]
article = example["article"]
print("Article:\n" + article)

input_text = "summarize: " + article  # article with task prefix

# Move the encoded inputs to the model's device; after trainer.train() the
# model may be on a GPU while the tokenizer returns CPU tensors.
inputs = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=512).to(model.device)

summary_ids = model.generate(
    **inputs,  # passes input_ids together with the attention mask
    max_length=128,
    num_beams=4,
    length_penalty=1.0,
    no_repeat_ngram_size=2,
    early_stopping=True,
)

summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
print("Generated Summary:\n" + summary)
print("Reference Summary:\n" + example["highlights"])

Article:
The mayor of the city announced new environmental policies today aimed at reducing emissions 
    by 40% by 2035. The initiative includes expanded public transit, incentives for electric vehicles, and stricter 
    regulations on industrial polluters.
Generated Summary:
The initiative includes expanded public transit, incentives for electric vehicles and stricter regulations on industrial polluters.
Reference Summary:
The presidential hopeful held a town hall meeting in Kenilworth on Tuesday .
During the meeting, high school English teacher Kathy Mooney got up to ask the governor a question about pensions .
She asked why he didn't seek a higher legal settlement in a case with ExxonMobil that would have contributed to the state's pension system .
Christie responded by repeatedly asking how much Mooney knew about the deal instead of answering her question .


In [None]:
# loading and saving models
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# load model and tokenizer from the same local folder
base_model_dir = "./t5_efficient_mini"
tokenizer = AutoTokenizer.from_pretrained(base_model_dir)
model = AutoModelForSeq2SeqLM.from_pretrained(base_model_dir)

# save model (disabled; shows the matching save calls)
# model_raw.save_pretrained("./t5_efficient_mini")
# tokenizer_raw.save_pretrained("./t5_efficient_mini")

In [None]:
# loading and saving datasets
from datasets import load_from_disk

# load dataset from its local directory and show the first training example
dataset_dir = "cnn_dailymail_full"
dataset = load_from_disk(dataset_dir)
first_train_example = dataset["train"][0]
print(first_train_example)

# save dataset (no additional import needed)
# dataset.save_to_disk("test")

# Archive

In [None]:
# finished successfully
# download cnn_dailymail dataset from internet and save it locally
from datasets import load_dataset

# dataset identifier and configuration name passed to load_dataset
dataset_name, dataset_config = "cnn_dailymail", "3.0.0"
dataset = load_dataset(dataset_name, dataset_config)
# dataset.save_to_disk("cnn_dailymail_full")

In [None]:
# finished successfully
# download raw T5-efficient-mini from internet and save it locally
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# model identifier used for both the tokenizer and the weights
model_name = "google/t5-efficient-mini"
tokenizer_raw = AutoTokenizer.from_pretrained(model_name)
model_raw = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# local save is disabled; enable to store the raw model next to the notebook
# model_raw.save_pretrained("./t5_efficient_mini")
# tokenizer_raw.save_pretrained("./t5_efficient_mini")