# Package 

#### Ref
- nltk : https://medium.com/pyladies-taiwan/nltk-%E5%88%9D%E5%AD%B8%E6%8C%87%E5%8D%97-%E4%B8%80-%E7%B0%A1%E5%96%AE%E6%98%93%E4%B8%8A%E6%89%8B%E7%9A%84%E8%87%AA%E7%84%B6%E8%AA%9E%E8%A8%80%E5%B7%A5%E5%85%B7%E7%AE%B1-%E6%8E%A2%E7%B4%A2%E7%AF%87-2010fd7c7540 


- Seq2Seq : https://zhuanlan.zhihu.com/p/548722311
- Tokenizer :https://zhuanlan.zhihu.com/p/591335566

In [1]:
from rich import print
from datasets import load_dataset , load_metric
from transformers import AutoTokenizer , BatchEncoding
import nltk
import string
import torch
import evaluate

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# download the data
# import nltk
# nltk.download()
# device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# device

### Loading the dataset

In [3]:
dataset = load_dataset("json" , data_files="./data/train.json")
dataset

DatasetDict({
    train: Dataset({
        features: ['headline', 'body'],
        num_rows: 100000
    })
})

In [4]:
# split the data
total_size = len(dataset["train"])
train_size = int(0.9 * total_size) 
# test_size = 
test_size_half = (total_size - train_size) // 2

print(f"train_size : {train_size} , test_size : {test_size_half}")

In [5]:
dataset_train_test = dataset["train"].train_test_split(test_size=test_size_half)
# dataset_validation = dataset_train_test["train"].train_test_split(test_size=test_size_half)

dataset["train"] = dataset_train_test["train"]
# dataset["test"] = dataset_train_test["test"]
dataset["validation"] = dataset_train_test["test"]

print(dataset)

# len(dataset["train"])

In [6]:
model_id = "t5-base"
model_file_name = model_id.replace("/" , "-")
print(f"model id : {model_id} , model file name: {model_file_name}")

### Loading the tokenizer

In [7]:
# look the note, maybe use the openAI Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer

T5TokenizerFast(name_or_path='t5-base', vocab_size=32100, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>', 'additional_special_tokens': ['<extra_id_0>', '<extra_id_1>', '<extra_id_2>', '<extra_id_3>', '<extra_id_4>', '<extra_id_5>', '<extra_id_6>', '<extra_id_7>', '<extra_id_8>', '<extra_id_9>', '<extra_id_10>', '<extra_id_11>', '<extra_id_12>', '<extra_id_13>', '<extra_id_14>', '<extra_id_15>', '<extra_id_16>', '<extra_id_17>', '<extra_id_18>', '<extra_id_19>', '<extra_id_20>', '<extra_id_21>', '<extra_id_22>', '<extra_id_23>', '<extra_id_24>', '<extra_id_25>', '<extra_id_26>', '<extra_id_27>', '<extra_id_28>', '<extra_id_29>', '<extra_id_30>', '<extra_id_31>', '<extra_id_32>', '<extra_id_33>', '<extra_id_34>', '<extra_id_35>', '<extra_id_36>', '<extra_id_37>', '<extra_id_38>', '<extra_id_39>', '<extra_id_40>', '<extra_id_41>', '<extra_id_42>',

In [8]:
 # TODO: change the input length
max_input_length = 1024
max_target_length = 64 

# data process

def clean_text(text:str) -> str:
    sentences = nltk.sent_tokenize(text.strip())
    sentences_cleaned = [s for sent in sentences for s in sent.split("\n")]
    
    sentences_cleaned_no_titles = [sent for sent in sentences_cleaned if len(sent) > 0 and sent[-1] in string.punctuation]
    
    text_cleaned = "\n".join(sentences_cleaned_no_titles)
    return text_cleaned


def preprocess_data(examples:dict) -> BatchEncoding:
    "headline , body"
    texts_cleaned = [clean_text(text) for text in examples["body"]]
    inputs = texts_cleaned
    
    # input to tokenizer
    model_inputs = tokenizer(inputs ,max_length=max_input_length , truncation=True,  padding=True,  return_tensors="pt") # .to(device=device) # 
    
    # label to tokenizer
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["headline"] ,  max_length=max_input_length , truncation=True, padding=True, return_tensors="pt") # .to(device=device) # max_length=max_target_length , truncation=True ,
    
    model_inputs["labels"] = labels["input_ids"]
        
    return model_inputs

### Datasets mapping by tokenizer

In [9]:
tokenized_datasets  = dataset.map(preprocess_data , batched=True)
tokenized_datasets

Map:   0%|          | 0/95000 [00:00<?, ? examples/s]

Map: 100%|██████████| 95000/95000 [02:13<00:00, 711.23 examples/s]
Map: 100%|██████████| 5000/5000 [00:07<00:00, 680.17 examples/s]


DatasetDict({
    train: Dataset({
        features: ['headline', 'body', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 95000
    })
    validation: Dataset({
        features: ['headline', 'body', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 5000
    })
})

### Loading the pre-train model

In [10]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer , Trainer , RobertaTokenizer ,RobertaModel 

In [11]:
batch_size = 10
model_name = f"data_science_hw3_model_{model_file_name}"
model_dir = f"./model/{model_name}"

args = Seq2SeqTrainingArguments(
    model_dir,
    evaluation_strategy="steps",
    eval_steps=100,
    logging_strategy="steps",
    logging_steps=100,
    save_strategy="steps",
    save_steps=100,
    learning_rate=4e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=1,
    predict_with_generate=True,
    fp16=True,
    load_best_model_at_end=True,
    metric_for_best_model="rouge1",
    report_to="tensorboard"
)

In [12]:
data_collator = DataCollatorForSeq2Seq(tokenizer)

In [13]:
# %pip install bert_score

In [14]:
# metric_rouge = load_metric("rouge")
# metric_bert_score = load_metric("bertscore")

metric_rouge = evaluate.load("rouge",rouge_types=["rouge1", "rouge2", "rougeL"])
metric_bert_score = evaluate.load("bertscore")



In [15]:
import numpy as np

def compute_metrics(eval_pred) -> dict:
    predictions , labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions , skip_special_tokens=True)
    
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip()))
                      for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) 
                      for label in decoded_labels]
    
    # Compute ROUGE scores
    result = metric_rouge.compute(predictions=decoded_preds, references=decoded_labels,
                            use_stemmer=True)

    result_bert_score = metric_bert_score.compute(predictions=decoded_preds, references=decoded_labels, lang="en") # model_type="distilbert-base-uncased",
    
    # Extract ROUGE f1 scores
    result = {key: value * 100 for key, value in result.items()}
    
    # add the bert score f1 mean
    result["BERTScore f1 mean"] = np.mean(result_bert_score["f1"]) * 100
    
    # Add mean generated length to metrics
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id)
                      for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)
    
    return {k: round(v, 4) for k, v in result.items()}

In [16]:
def model_init():
    return AutoModelForSeq2SeqLM.from_pretrained(model_id) # .to(device)

In [17]:
trainer = Seq2SeqTrainer(
    model_init=model_init,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


### open the tensorboard

In [18]:
# # Start TensorBoard before training to monitor it in progress
# %load_ext tensorboard
# %tensorboard --logdir '{model_dir}'/runs

### Training the model

In [19]:
trainer.train()

Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Bertscore f1 mean,Gen Len
100,3.2852,0.977937,5.5908,2.743,5.0747,5.1029,13.8525,2.5806
200,0.9961,0.870797,32.7317,16.2615,29.3764,29.481,83.2466,14.702
300,0.9255,0.837946,35.9801,18.7158,32.5489,32.6599,87.6067,14.9996
400,0.8691,0.82381,36.4185,19.1559,32.9034,33.0328,87.8916,15.4528
500,0.8434,0.814815,37.3217,19.8506,33.7381,33.8659,88.0879,15.1482
600,0.8484,0.805275,37.2993,19.8605,33.8605,33.9815,88.1035,14.9012
700,0.8111,0.800759,37.8354,20.1419,34.1804,34.3094,88.1818,14.9574
800,0.8278,0.794648,37.7972,20.0511,34.1631,34.2905,88.2068,15.3954
900,0.7961,0.790707,38.1004,20.4123,34.497,34.6488,88.2592,15.2356
1000,0.7737,0.787761,38.5764,20.8542,34.9859,35.1111,88.3783,14.8082


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


KeyboardInterrupt: 

### Evaluate the model in training model

In [None]:
trainer.evaluate()