# HW3
---
### Ref 
- seq2seq : https://medium.com/nlplanet/a-full-guide-to-finetuning-t5-for-text2text-and-building-a-demo-with-streamlit-c72009631887
- t5 : https://learnopencv.com/fine-tuning-t5/

In [1]:
# %pip install datasets transformers rouge_score nltk

In [2]:
from rich import print
from datasets import load_dataset , load_metric
from transformers import AutoTokenizer , BatchEncoding , T5Tokenizer, T5ForConditionalGeneration,TrainingArguments , Trainer,DataCollatorForSeq2Seq,Seq2SeqTrainingArguments,Seq2SeqTrainer
import nltk
import string
import torch
import evaluate

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
dataset = load_dataset("json" , data_files="./data/train.json")
dataset

DatasetDict({
    train: Dataset({
        features: ['headline', 'body'],
        num_rows: 100000
    })
})

In [4]:
# split the data
total_size = len(dataset["train"])
train_size = int(0.9 * total_size) 
# test_size = 
test_size_half = (total_size - train_size) // 2

print(f"train_size : {train_size} , test_size : {test_size_half}")

In [5]:
dataset_train_test = dataset["train"].train_test_split(test_size=test_size_half)
dataset_validation = dataset_train_test["train"].train_test_split(test_size=test_size_half)

dataset["train"] = dataset_train_test["train"]
dataset["test"] = dataset_train_test["test"]
dataset["validation"] = dataset_validation["test"]

print(dataset)

# len(dataset["train"])

In [6]:
# https://huggingface.co/google/flan-t5-base
model_id = "google/flan-t5-base"
model_file_name = model_id.replace("/" , "-")
print(f"model id : {model_id} , model file name: {model_file_name}")

In [7]:
tokenizer = T5Tokenizer.from_pretrained(model_id)
tokenizer

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


T5Tokenizer(name_or_path='google/flan-t5-base', vocab_size=32000, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>', 'additional_special_tokens': ['<extra_id_0>', '<extra_id_1>', '<extra_id_2>', '<extra_id_3>', '<extra_id_4>', '<extra_id_5>', '<extra_id_6>', '<extra_id_7>', '<extra_id_8>', '<extra_id_9>', '<extra_id_10>', '<extra_id_11>', '<extra_id_12>', '<extra_id_13>', '<extra_id_14>', '<extra_id_15>', '<extra_id_16>', '<extra_id_17>', '<extra_id_18>', '<extra_id_19>', '<extra_id_20>', '<extra_id_21>', '<extra_id_22>', '<extra_id_23>', '<extra_id_24>', '<extra_id_25>', '<extra_id_26>', '<extra_id_27>', '<extra_id_28>', '<extra_id_29>', '<extra_id_30>', '<extra_id_31>', '<extra_id_32>', '<extra_id_33>', '<extra_id_34>', '<extra_id_35>', '<extra_id_36>', '<extra_id_37>', '<extra_id_38>', '<extra_id_39>', '<extra_id_40>', '<extra_id_41>', '<extra_id_42>', '<extra_id_43>', '

In [10]:
 # TODO: change the input length
max_input_length = 2048
max_target_length = 64 
prefix = "summarize: "
# data process

def clean_text(text:str) -> str:
    sentences = nltk.sent_tokenize(text.strip())
    sentences_cleaned = [s for sent in sentences for s in sent.split("\n")]
    
    sentences_cleaned_no_titles = [sent for sent in sentences_cleaned if len(sent) > 0 and sent[-1] in string.punctuation]
    
    text_cleaned = "\n".join(sentences_cleaned_no_titles)
    return text_cleaned


def preprocess_data(examples:dict) -> BatchEncoding:
    "headline , body"
    texts_cleaned = [clean_text(text) for text in examples["body"]]
    # add prefix
    inputs = [prefix + text for text in texts_cleaned]
    
    # input to tokenizer
    model_inputs = tokenizer(inputs  , truncation=True,  padding=True, return_tensors="pt") # .to(device=device) # max_length=max_input_length, 
    
    # label to tokenizer
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["headline"] ,truncation=True,  padding=True,  return_tensors="pt") # .to(device=device) # max_length=max_target_length , truncation=True , max_length=max_input_length , truncation=True, padding=True,
    
    model_inputs["labels"] = labels["input_ids"]
        
    return model_inputs

In [11]:
tokenized_datasets  = dataset.map(preprocess_data , batched=True)
tokenized_datasets

Map:   0%|          | 0/95000 [00:00<?, ? examples/s]

Map: 100%|██████████| 95000/95000 [04:11<00:00, 377.84 examples/s]
Map: 100%|██████████| 5000/5000 [00:13<00:00, 383.83 examples/s]
Map: 100%|██████████| 5000/5000 [00:13<00:00, 380.48 examples/s]


DatasetDict({
    train: Dataset({
        features: ['headline', 'body', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 95000
    })
    test: Dataset({
        features: ['headline', 'body', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 5000
    })
    validation: Dataset({
        features: ['headline', 'body', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 5000
    })
})

In [12]:
batch_size = 10
model_name = f"data_science_hw3_model_{model_file_name}"
model_dir = f"./model/{model_name}"

In [13]:
model = T5ForConditionalGeneration.from_pretrained(model_id)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Total parameters and trainable parameters.
total_params = sum(p.numel() for p in model.parameters())
print(f"{total_params:,} total parameters.")

total_trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f"{total_trainable_params:,} training parameters.")

In [14]:
EPOCHS = 1
BATCH_SIZE = 10

In [15]:
training_args = Seq2SeqTrainingArguments(
    model_dir,
    evaluation_strategy="steps",
    eval_steps=100,
    logging_strategy="steps",
    logging_steps=100,
    save_strategy="steps",
    save_steps=100,
    learning_rate=4e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=EPOCHS,
    predict_with_generate=True,
    fp16=True,
    load_best_model_at_end=True,
    metric_for_best_model="rouge1",
    report_to="tensorboard"
)

In [16]:
data_collator = DataCollatorForSeq2Seq(tokenizer)

In [17]:
metric_rouge = evaluate.load("rouge",rouge_types=["rouge1", "rouge2", "rougeL"])
metric_bert_score = evaluate.load("bertscore")


In [18]:
import numpy as np

def compute_metrics(eval_pred) -> dict:
    predictions , labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions , skip_special_tokens=True)
    
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip()))
                      for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) 
                      for label in decoded_labels]
    
    # Compute ROUGE scores
    result = metric_rouge.compute(predictions=decoded_preds, references=decoded_labels,
                            use_stemmer=True)
    # model_type="distilbert-base-uncased",
    result_bert_score = metric_bert_score.compute(predictions=decoded_preds, references=decoded_labels, lang="en")
    
    # Extract ROUGE f1 scores
    result = {key: value * 100 for key, value in result.items()}
    
    # add the bert score f1 mean
    result["BERTScore f1 mean"] = np.mean(result_bert_score["f1"]) * 100
    
    # Add mean generated length to metrics
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id)
                      for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)
    
    return {k: round(v, 4) for k, v in result.items()}

In [21]:
def model_init():
    return model

In [22]:
trainer = Seq2SeqTrainer(
    model_init=model_init,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [23]:
trainer.train()

Step,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
# history

In [None]:
tokenizer.save_pretrained(model_dir)