## This evaluation script is inspired by the evaluation script from VietAI's ViT5 study: https://github.com/vietai/ViT5

In [None]:
# For running in Google Colab: uncomment the two lines below.

# from google.colab import auth
# auth.authenticate_user()

In [None]:
!pip install datasets evaluate transformers[sentencepiece]
!apt install git-lfs
!pip install rouge_score

In [None]:
# the test set
!wget https://raw.githubusercontent.com/vietai/ViT5/main/data/wikilingua/test.tsv -O test-wiki.tsv
!wget https://raw.githubusercontent.com/vietai/ViT5/main/data/vietnews/test.tsv -O test-vietnews.tsv

In [None]:
from datasets import Dataset
from transformers import AutoTokenizer, BartphoTokenizer, DataCollatorForSeq2Seq
from transformers import T5ForConditionalGeneration, MBartForConditionalGeneration, AutoModelForSeq2SeqLM
from tqdm.notebook import tqdm
from torch.utils.data import DataLoader
import torch

# Name of the target model; it selects the tokenizer and the data formatting below.
# Supported values:
#   "VietAI/vit5-base"
#   "vinai/bartpho-syllable"
model_name = "VietAI/vit5-base" # the target model name
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Preparing the local checkpoint directory ("./model_bin" in this case):
#   1. Rename the model weights file to pytorch_model.bin.
#   2. Rename the corresponding config file to config.json. The config file can be downloaded from
#      https://huggingface.co/vinai/bartpho-syllable/blob/main/config.json (BARTpho-syllable large) or
#      https://huggingface.co/VietAI/vit5-base/blob/main/config.json (ViT5 base).
#   3. Put both files into the directory.
model = AutoModelForSeq2SeqLM.from_pretrained("./model_bin") # the directory where we have the target model

# Use the GPU when one is available; otherwise fall back to the CPU instead of crashing.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)
model.eval()

In [3]:
# Vietnamese tone-mark normalization table for BARTpho-syllable, taken from
# https://github.com/VinAIResearch/BARTpho/blob/main/VietnameseToneNormalization.md
# Each key is a two-vowel sequence carrying the tone mark on its first vowel; the value is the
# same sequence with the tone mark on the second vowel instead. Lowercase, capitalized and
# all-uppercase spellings are listed as separate entries.
dict_map = {
    "òa": "oà",
    "Òa": "Oà",
    "ÒA": "OÀ",
    "óa": "oá",
    "Óa": "Oá",
    "ÓA": "OÁ",
    "ỏa": "oả",
    "Ỏa": "Oả",
    "ỎA": "OẢ",
    "õa": "oã",
    "Õa": "Oã",
    "ÕA": "OÃ",
    "ọa": "oạ",
    "Ọa": "Oạ",
    "ỌA": "OẠ",
    "òe": "oè",
    "Òe": "Oè",
    "ÒE": "OÈ",
    "óe": "oé",
    "Óe": "Oé",
    "ÓE": "OÉ",
    "ỏe": "oẻ",
    "Ỏe": "Oẻ",
    "ỎE": "OẺ",
    "õe": "oẽ",
    "Õe": "Oẽ",
    "ÕE": "OẼ",
    "ọe": "oẹ",
    "Ọe": "Oẹ",
    "ỌE": "OẸ",
    "ùy": "uỳ",
    "Ùy": "Uỳ",
    "ÙY": "UỲ",
    "úy": "uý",
    "Úy": "Uý",
    "ÚY": "UÝ",
    "ủy": "uỷ",
    "Ủy": "Uỷ",
    "ỦY": "UỶ",
    "ũy": "uỹ",
    "Ũy": "Uỹ",
    "ŨY": "UỸ",
    "ụy": "uỵ",
    "Ụy": "Uỵ",
    "ỤY": "UỴ",
    }


def replace_all(text):
    """Return `text` with every key of `dict_map` replaced by its mapped value.

    The substitutions are applied one entry at a time, in the dictionary's
    iteration order, each on the result of the previous one.
    """
    normalized = text
    for source in dict_map:
        normalized = normalized.replace(source, dict_map[source])
    return normalized


def preprocess_function(examples):
    """Tokenize a batch of source texts together with their reference summaries.

    Reads the "inputs" and "labels" columns of `examples` and returns the
    tokenizer output for the inputs, with the token ids of the tokenized
    labels stored under the "labels" key. Both calls use truncation and
    `padding=True`; `max_length` is 1024 for the inputs and 256 for the labels.
    """
    encoded_inputs = tokenizer(
        examples["inputs"], padding=True, truncation=True, max_length=1024
    )
    encoded_labels = tokenizer(
        examples["labels"], padding=True, truncation=True, max_length=256
    )
    encoded_inputs['labels'] = encoded_labels['input_ids']
    return encoded_inputs

In [4]:
# Path of the tab-separated test set to evaluate on (column 0: input text, column 1: reference).
# Defaults to the WikiLingua file downloaded earlier in this notebook; use 'test-vietnews.tsv'
# for VietNews, or point it at any other file with the same two-column layout.
test_file = 'test-wiki.tsv'

# BARTpho-syllable expects tone-normalized text, so both columns go through replace_all.
use_bartpho_format = "bartpho-syllable" in model_name
if use_bartpho_format:
  print("format data for bartpho-syllable")

input_lines = []
label_lines = []
# Explicit UTF-8 so the Vietnamese text decodes the same way regardless of the platform default.
with open(test_file, encoding='utf-8') as file:
  for line_number, line in enumerate(file, start=1):
    line = line.strip()
    if not line:
      # Tolerate blank lines (e.g. an empty line at the end of the file).
      continue
    fields = line.split('\t')
    if len(fields) < 2:
      raise ValueError(
          f"{test_file}, line {line_number}: expected two tab-separated columns, found {len(fields)}"
      )
    # Any columns after the first two are ignored.
    input_line, label_line = fields[0], fields[1]
    if use_bartpho_format:
      input_line = replace_all(input_line)
      label_line = replace_all(label_line)

    input_lines.append(input_line)
    label_lines.append(label_line)

dict_obj = {'inputs': input_lines, 'labels': label_lines}

dataset = Dataset.from_dict(dict_obj)
# preprocess_function overwrites the 'labels' column with token ids; the raw 'inputs' text is dropped.
tokenized_datasets = dataset.map(preprocess_function, batched=True, remove_columns=['inputs'], num_proc=10)
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="pt")

Map (num_proc=10):   0%|          | 0/3916 [00:00<?, ? examples/s]

In [None]:
# Show the input text at index 100 of the loaded test set as a sample.
input_lines[100] # output a sample

In [None]:
import torch 
import numpy as np
from datasets import load_metric

# ROUGE metric object; each batch is added inside the loop and scored by metrics.compute() at the end.
metrics = load_metric('rouge')
max_target_length = 256
dataloader = torch.utils.data.DataLoader(tokenized_datasets, collate_fn=data_collator, batch_size=64)

# Send batches to whichever device the model is on instead of assuming 'cuda'.
device = model.device

predictions = []
references = []
for batch in tqdm(dataloader):
  generated_ids = model.generate(
      input_ids=batch['input_ids'].to(device),
      attention_mask=batch['attention_mask'].to(device),
      max_length=max_target_length,
  )
  # Decoding needs no target-tokenizer context, so the deprecated
  # tokenizer.as_target_tokenizer() wrapper is not used here.
  outputs = tokenizer.batch_decode(
      generated_ids, clean_up_tokenization_spaces=False, skip_special_tokens=True
  )

  # Label positions holding -100 are swapped for the pad token id so they can be decoded
  # (and then dropped by skip_special_tokens).
  label_ids = batch['labels'].masked_fill(batch['labels'] == -100, tokenizer.pad_token_id)
  actuals = tokenizer.batch_decode(
      label_ids, clean_up_tokenization_spaces=False, skip_special_tokens=True
  )

  predictions.extend(outputs)
  references.extend(actuals)
  metrics.add_batch(predictions=outputs, references=actuals)

# Scores of the generated summaries against the decoded (tokenized, possibly truncated) references.
metrics.compute()

In [None]:
# Score the collected predictions against the raw reference lines read from the test file
# and list the `mid.fmeasure` of every entry returned by the metric.
[
    {rouge_name: aggregate.mid.fmeasure}
    for rouge_name, aggregate in metrics.compute(
        predictions=predictions, references=label_lines
    ).items()
]