# Package 

#### Ref
- nltk : https://medium.com/pyladies-taiwan/nltk-%E5%88%9D%E5%AD%B8%E6%8C%87%E5%8D%97-%E4%B8%80-%E7%B0%A1%E5%96%AE%E6%98%93%E4%B8%8A%E6%89%8B%E7%9A%84%E8%87%AA%E7%84%B6%E8%AA%9E%E8%A8%80%E5%B7%A5%E5%85%B7%E7%AE%B1-%E6%8E%A2%E7%B4%A2%E7%AF%87-2010fd7c7540 


- Seq2Seq : https://zhuanlan.zhihu.com/p/548722311
- Tokenizer :https://zhuanlan.zhihu.com/p/591335566

In [45]:
from rich import print
from datasets import load_dataset , load_metric
from transformers import AutoTokenizer , BatchEncoding
import nltk
import string

In [46]:
# download the data
# import nltk
# nltk.download()

### Loading the dataset

In [47]:
dataset = load_dataset("json" , data_files="./data/train.json")
dataset

DatasetDict({
    train: Dataset({
        features: ['headline', 'body'],
        num_rows: 100000
    })
})

In [48]:
# split the data
total_size = len(dataset["train"])
train_size = int(0.9 * total_size) 
# test_size = 
test_size_half = (total_size - train_size) // 2

print(f"train_size : {train_size} , test_size : {test_size_half}")

In [49]:
dataset_train_test = dataset["train"].train_test_split(test_size=test_size_half)
dataset_validation = dataset_train_test["train"].train_test_split(test_size=test_size_half)

dataset["train"] = dataset_train_test["train"]
dataset["test"] = dataset_train_test["test"]
dataset["validation"] = dataset_validation["test"]

print(dataset)

# len(dataset["train"])

### Loading the tokenizer

In [50]:
# look the note, maybe use the openAI Tokenizer
tokenizer = AutoTokenizer.from_pretrained("roberta-base")
tokenizer

RobertaTokenizerFast(name_or_path='roberta-base', vocab_size=50265, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	1: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	3: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	50264: AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False, special=True),
}

In [51]:
 # TODO: change the input length
max_input_length = 512
max_target_length = 64 

# data process

def clean_text(text:str) -> str:
    sentences = nltk.sent_tokenize(text.strip())
    sentences_cleaned = [s for sent in sentences for s in sent.split("\n")]
    
    sentences_cleaned_no_titles = [sent for sent in sentences_cleaned if len(sent) > 0 and sent[-1] in string.punctuation]
    
    text_cleaned = "\n".join(sentences_cleaned_no_titles)
    return text_cleaned


def preprocess_data(examples:dict) -> BatchEncoding:
    "headline , body"
    texts_cleaned = [clean_text(text) for text in examples["body"]]
    inputs = texts_cleaned
    
    # input to tokenizer
    model_inputs = tokenizer(inputs , max_length=max_input_length , truncation=True)
    
    # label to tokenizer
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["headline"] , max_length=max_target_length , truncation=True)
    
    model_inputs["labels"] = labels["input_ids"]
        
    return model_inputs

### Datasets mapping by tokenizer

In [52]:
tokenized_datasets  = dataset.map(preprocess_data , batched=True)
tokenized_datasets

Map:   0%|          | 0/95000 [00:00<?, ? examples/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Map: 100%|██████████| 95000/95000 [01:20<00:00, 1177.57 examples/s]
Map: 100%|██████████| 5000/5000 [00:04<00:00, 1165.30 examples/s]
Map: 100%|██████████| 5000/5000 [00:03<00:00, 1264.00 examples/s]


DatasetDict({
    train: Dataset({
        features: ['headline', 'body', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 95000
    })
    test: Dataset({
        features: ['headline', 'body', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 5000
    })
    validation: Dataset({
        features: ['headline', 'body', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 5000
    })
})

### Loading the pre-train model

In [53]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

In [None]:
batch_size = 8
model_name = "data_science_hw3_model_1_roberta"
model_dir = f"./model/{model_name}"

args = Seq2SeqTrainingArguments(
    model_dir,
    evaluation_strategy="steps",
    eval_steps=100,
    logging_strategy="steps",
    logging_steps=100,
    save_strategy="steps",
    save_steps=200,
    learning_rate=4e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=1,
    predict_with_generate=True,
    # fp16=True,
    load_best_model_at_end=True,
    metric_for_best_model="rouge1",
    report_to="tensorboard"
)

In [None]:
data_collator = DataCollatorForSeq2Seq(tokenizer)

In [None]:
metric = load_metric("rouge")

### Training the model

### Evaluate the model in training model