# Package 

#### Ref
- nltk : https://medium.com/pyladies-taiwan/nltk-%E5%88%9D%E5%AD%B8%E6%8C%87%E5%8D%97-%E4%B8%80-%E7%B0%A1%E5%96%AE%E6%98%93%E4%B8%8A%E6%89%8B%E7%9A%84%E8%87%AA%E7%84%B6%E8%AA%9E%E8%A8%80%E5%B7%A5%E5%85%B7%E7%AE%B1-%E6%8E%A2%E7%B4%A2%E7%AF%87-2010fd7c7540 


- Seq2Seq : https://zhuanlan.zhihu.com/p/548722311
- Tokenizer :https://zhuanlan.zhihu.com/p/591335566

In [1]:
from rich import print
from datasets import load_dataset , load_metric
from transformers import AutoTokenizer , BatchEncoding
import nltk
import string

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# download the data
# import nltk
# nltk.download()

### Loading the dataset

In [3]:
dataset = load_dataset("json" , data_files="./data/train.json")
dataset

DatasetDict({
    train: Dataset({
        features: ['headline', 'body'],
        num_rows: 100000
    })
})

In [4]:
# split the data
total_size = len(dataset["train"])
train_size = int(0.9 * total_size) 
# test_size = 
test_size_half = (total_size - train_size) // 2

print(f"train_size : {train_size} , test_size : {test_size_half}")

In [5]:
dataset_train_test = dataset["train"].train_test_split(test_size=test_size_half)
dataset_validation = dataset_train_test["train"].train_test_split(test_size=test_size_half)

dataset["train"] = dataset_train_test["train"]
dataset["test"] = dataset_train_test["test"]
dataset["validation"] = dataset_validation["test"]

print(dataset)

# len(dataset["train"])

In [6]:
model_id = "facebook/bart-large"
model_file_name = model_id.replace("/" , "-")
print(f"model id : {model_id} , model file name: {model_file_name}")

### Loading the tokenizer

In [7]:
# look the note, maybe use the openAI Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer

BartTokenizerFast(name_or_path='facebook/bart-large', vocab_size=50265, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	1: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	3: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	50264: AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=True, special=True),
}

In [8]:
 # TODO: change the input length
max_input_length = 1024
max_target_length = 64 

# data process

def clean_text(text:str) -> str:
    sentences = nltk.sent_tokenize(text.strip())
    sentences_cleaned = [s for sent in sentences for s in sent.split("\n")]
    
    sentences_cleaned_no_titles = [sent for sent in sentences_cleaned if len(sent) > 0 and sent[-1] in string.punctuation]
    
    text_cleaned = "\n".join(sentences_cleaned_no_titles)
    return text_cleaned


def preprocess_data(examples:dict) -> BatchEncoding:
    "headline , body"
    texts_cleaned = [clean_text(text) for text in examples["body"]]
    inputs = texts_cleaned
    
    # input to tokenizer
    model_inputs = tokenizer(inputs ,max_length=max_input_length , truncation=True,  padding=True,  return_tensors="pt") # 
    
    # label to tokenizer
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["headline"] ,  max_length=max_input_length , truncation=True, padding=True, return_tensors="pt") # max_length=max_target_length , truncation=True ,
    
    model_inputs["labels"] = labels["input_ids"]
        
    return model_inputs

### Datasets mapping by tokenizer

In [9]:
tokenized_datasets  = dataset.map(preprocess_data , batched=True)
tokenized_datasets

Map:   0%|          | 0/95000 [00:00<?, ? examples/s]

Map: 100%|██████████| 95000/95000 [01:38<00:00, 966.29 examples/s] 
Map: 100%|██████████| 5000/5000 [00:04<00:00, 1022.77 examples/s]
Map: 100%|██████████| 5000/5000 [00:04<00:00, 1022.19 examples/s]


DatasetDict({
    train: Dataset({
        features: ['headline', 'body', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 95000
    })
    test: Dataset({
        features: ['headline', 'body', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 5000
    })
    validation: Dataset({
        features: ['headline', 'body', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 5000
    })
})

### Loading the pre-train model

In [10]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer , Trainer , RobertaTokenizer ,RobertaModel 

In [11]:
batch_size = 10
model_name = f"data_science_hw3_model_{model_file_name}"
model_dir = f"./model/{model_name}"

args = Seq2SeqTrainingArguments(
    model_dir,
    evaluation_strategy="steps",
    eval_steps=100,
    logging_strategy="steps",
    logging_steps=100,
    save_strategy="steps",
    save_steps=200,
    learning_rate=4e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=1,
    predict_with_generate=True,
    fp16=True,
    load_best_model_at_end=True,
    metric_for_best_model="rouge1",
    report_to="tensorboard"
)

In [12]:
data_collator = DataCollatorForSeq2Seq(tokenizer)

In [13]:
metric = load_metric("rouge")

  metric = load_metric("rouge")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [14]:
import numpy as np

def compute_metrics(eval_pred) -> dict:
    predictions , labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions , skip_special_tokens=True)
    
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip()))
                      for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) 
                      for label in decoded_labels]
    
    # Compute ROUGE scores
    result = metric.compute(predictions=decoded_preds, references=decoded_labels,
                            use_stemmer=True)

    # Extract ROUGE f1 scores
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}
    
    # Add mean generated length to metrics
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id)
                      for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)
    
    return {k: round(v, 4) for k, v in result.items()}

In [15]:
def model_init():
    return AutoModelForSeq2SeqLM.from_pretrained(model_id)

In [16]:
trainer = Seq2SeqTrainer(
    model_init=model_init,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


### open the tensorboard

In [17]:
# # Start TensorBoard before training to monitor it in progress
# %load_ext tensorboard
# %tensorboard --logdir '{model_dir}'/runs

### Training the model

In [18]:
trainer.train()

Step,Training Loss,Validation Loss


KeyboardInterrupt: 

### Evaluate the model in training model