<a href="https://colab.research.google.com/github/HamdanXI/nlp_adventure/blob/main/802%20code/bleurt-evaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers datasets tqdm
!pip install git+https://github.com/google-research/bleurt.git

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from datasets import load_dataset
from datasets import load_metric
import torch
from tqdm import tqdm

bleurt_metric = load_metric('bleurt')

t5_small_paradetox_1Token_tokenizer = AutoTokenizer.from_pretrained("HamdanXI/t5-small-paradetox-1Token-split-masked")
t5_small_paradetox_1Token_model = AutoModelForSeq2SeqLM.from_pretrained("HamdanXI/t5-small-paradetox-1Token-split-masked")

# BART-base-detox (10,000 epochs with the learning rate of 3e-5)
bart_base_detox_tokenizer = AutoTokenizer.from_pretrained("s-nlp/bart-base-detox")
bart_base_detox_model = AutoModelForSeq2SeqLM.from_pretrained("s-nlp/bart-base-detox")

bart_base_paradetox_split_tokenizer = AutoTokenizer.from_pretrained("HamdanXI/bart-base-paradetox-split")
bart_base_paradetox_split_model = AutoModelForSeq2SeqLM.from_pretrained("HamdanXI/bart-base-paradetox-split")


paradetox_dataset = load_dataset("HamdanXI/paradetox-split")
paradetox_1token_dataset = load_dataset("HamdanXI/paradetox-1Token-Split")

In [18]:
def max_token_length(input, label, tokenizer):
  max_token_length_input = max(len(tokenizer.encode(item)) for item in input)
  max_token_length_label = max(len(tokenizer.encode(item)) for item in label)

  if max_token_length_input > max_token_length_label:
      highest_length = max_token_length_input
  else:
      highest_length = max_token_length_label

  return highest_length

In [48]:
def generate_predictions(texts, model, tokenizer, highest_length):
    predictions = []
    for text in tqdm(texts, desc="Generating predictions"):
        inputs = tokenizer(text, padding=True, truncation=True, max_length=highest_length, return_tensors="pt")
        with torch.no_grad():
            outputs = model.generate(**inputs)
        predictions.extend([tokenizer.decode(output, skip_special_tokens=True) for output in outputs])
    return predictions

def bleurt_evaluate(input, label, model, tokenizer, highest_length):
    predictions = generate_predictions(input, model, tokenizer, highest_length)

    score_results = bleurt_metric.compute(predictions=predictions, references=label)

    scores = score_results['scores']

    average_score = sum(scores) / len(scores) if scores else 0
    print(f"Average BLEURT Score: {average_score}")

In [21]:
highest_length_1token_t5_small = max_token_length(paradetox_1token_dataset['test']["en_toxic_comment"], paradetox_1token_dataset['test']["en_neutral_comment"], t5_small_paradetox_1Token_tokenizer)
highest_length_1token_bart_base = max_token_length(paradetox_1token_dataset['test']["en_toxic_comment"], paradetox_1token_dataset['test']["en_neutral_comment"], bart_base_detox_tokenizer)
highest_length_1token_bart_base_split = max_token_length(paradetox_1token_dataset['test']["en_toxic_comment"], paradetox_1token_dataset['test']["en_neutral_comment"], bart_base_paradetox_split_tokenizer)

In [47]:
bleurt_evaluate(paradetox_1token_dataset['test']["en_toxic_comment"], paradetox_1token_dataset['test']["en_neutral_comment"], t5_small_paradetox_1Token_model, t5_small_paradetox_1Token_tokenizer, highest_length_1token_t5_small)

Average BLEURT Score: 0.574729475563647


In [49]:
bleurt_evaluate(paradetox_1token_dataset['test']["en_toxic_comment"], paradetox_1token_dataset['test']["en_neutral_comment"], bart_base_detox_model, bart_base_detox_tokenizer, highest_length_1token_bart_base)

Generating predictions: 100%|██████████| 811/811 [05:25<00:00,  2.49it/s]


Average BLEURT Score: 0.6970971491275086


In [50]:
bleurt_evaluate(paradetox_1token_dataset['test']["en_toxic_comment"], paradetox_1token_dataset['test']["en_neutral_comment"], bart_base_paradetox_split_model, bart_base_paradetox_split_tokenizer, highest_length_1token_bart_base_split)

Generating predictions: 100%|██████████| 811/811 [05:32<00:00,  2.44it/s]


Average BLEURT Score: 0.6858602125707522


In [51]:
highest_length_t5_small = max_token_length(paradetox_dataset['test']["en_toxic_comment"], paradetox_dataset['test']["en_neutral_comment"], t5_small_paradetox_1Token_tokenizer)
highest_length_bart_base = max_token_length(paradetox_dataset['test']["en_toxic_comment"], paradetox_dataset['test']["en_neutral_comment"], bart_base_detox_tokenizer)
highest_length_bart_base_split = max_token_length(paradetox_dataset['test']["en_toxic_comment"], paradetox_dataset['test']["en_neutral_comment"], bart_base_paradetox_split_tokenizer)

In [52]:
bleurt_evaluate(paradetox_dataset['test']["en_toxic_comment"], paradetox_dataset['test']["en_neutral_comment"], t5_small_paradetox_1Token_model, t5_small_paradetox_1Token_tokenizer, highest_length_t5_small)

Generating predictions: 100%|██████████| 671/671 [02:21<00:00,  4.75it/s]


Average BLEURT Score: -0.44180790879021103


In [53]:
bleurt_evaluate(paradetox_dataset['test']["en_toxic_comment"], paradetox_dataset['test']["en_neutral_comment"], bart_base_detox_model, bart_base_detox_tokenizer, highest_length_bart_base)

Generating predictions: 100%|██████████| 671/671 [04:28<00:00,  2.50it/s]


Average BLEURT Score: 0.2883815431992361


In [54]:
bleurt_evaluate(paradetox_dataset['test']["en_toxic_comment"], paradetox_dataset['test']["en_neutral_comment"], bart_base_paradetox_split_model, bart_base_paradetox_split_tokenizer, highest_length_bart_base_split)

Generating predictions: 100%|██████████| 671/671 [05:00<00:00,  2.23it/s]


Average BLEURT Score: -0.13517926543418057
