<a href="https://colab.research.google.com/github/HamdanXI/nlp_adventure/blob/main/802%20code/bert-score-evaluation-t5-small.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers datasets bert-score tqdm

In [2]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from datasets import load_dataset
from bert_score import score
import torch

t5_small_paradetox_1Token_tokenizer = AutoTokenizer.from_pretrained("HamdanXI/t5-small-paradetox-1Token-split-masked")
t5_small_paradetox_1Token_model = AutoModelForSeq2SeqLM.from_pretrained("HamdanXI/t5-small-paradetox-1Token-split-masked")


# BART-base-detox (10,000 epochs with the learning rate of 3e-5)
bart_base_detox_tokenizer = AutoTokenizer.from_pretrained("s-nlp/bart-base-detox")
bart_base_detox_model = AutoModelForSeq2SeqLM.from_pretrained("s-nlp/bart-base-detox")


paradetox_dataset = load_dataset("s-nlp/paradetox")
paradetox_1token_dataset = load_dataset("HamdanXI/paradetox-1Token-Split")

In [12]:
import random

random_indices = random.sample(range(len(paradetox_dataset['train'])), 671)

paradetox_randomSample_dataset = paradetox_dataset['train'].select(random_indices)

paradetox_randomSample_dataset

Dataset({
    features: ['en_toxic_comment', 'en_neutral_comment'],
    num_rows: 671
})

In [4]:
def max_token_length(input, label, tokenizer):
  max_token_length_input = max(len(tokenizer.encode(item)) for item in input)
  max_token_length_label = max(len(tokenizer.encode(item)) for item in label)

  if max_token_length_input > max_token_length_label:
      highest_length = max_token_length_input
  else:
      highest_length = max_token_length_label

  return highest_length

In [5]:
from tqdm import tqdm

def generate_predictions(texts, model, tokenizer, highest_length):
    predictions = []
    for text in tqdm(texts, desc="Generating predictions"):
        inputs = tokenizer(text, padding=True, truncation=True, max_length=highest_length, return_tensors="pt")
        with torch.no_grad():
            outputs = model.generate(**inputs)
        predictions.extend([tokenizer.decode(output, skip_special_tokens=True) for output in outputs])
    return predictions

def bert_score_evaluate(input, label, model, tokenizer, highest_length):
  predictions = generate_predictions(input, model, tokenizer, highest_length)

  # Compute BERT Score
  P, R, F1 = score(predictions, label, lang="en", rescale_with_baseline=True)

  # Compute and print the average scores
  print(f"Precision: {P.mean()}, Recall: {R.mean()}, F1 Score: {F1.mean()}")

In [13]:
highest_length_1token_t5_small = max_token_length(paradetox_1token_dataset['test']["en_toxic_comment"], paradetox_1token_dataset['test']["en_neutral_comment"], t5_small_paradetox_1Token_tokenizer)
highest_length_1token_bart_base = max_token_length(paradetox_1token_dataset['test']["en_toxic_comment"], paradetox_1token_dataset['test']["en_neutral_comment"], bart_base_detox_tokenizer)

In [7]:
bert_score_evaluate(paradetox_1token_dataset['test']["en_toxic_comment"], paradetox_1token_dataset['test']["en_neutral_comment"], t5_small_paradetox_1Token_model, t5_small_paradetox_1Token_tokenizer, highest_length_1token_t5_small)

Generating predictions: 100%|██████████| 811/811 [02:43<00:00,  4.96it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Precision: 0.89088374376297, Recall: 0.8717514276504517, F1 Score: 0.8811373710632324


In [8]:
bert_score_evaluate(paradetox_1token_dataset['test']["en_toxic_comment"], paradetox_1token_dataset['test']["en_neutral_comment"], bart_base_detox_model, bart_base_detox_tokenizer, highest_length_1token_bart_base)

Generating predictions: 100%|██████████| 811/811 [05:46<00:00,  2.34it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Precision: 0.8873310685157776, Recall: 0.8524273633956909, F1 Score: 0.8696255087852478


In [14]:
highest_length_t5_small = max_token_length(paradetox_randomSample_dataset["en_toxic_comment"], paradetox_randomSample_dataset["en_neutral_comment"], t5_small_paradetox_1Token_tokenizer)
highest_length_bart_base = max_token_length(paradetox_randomSample_dataset["en_toxic_comment"], paradetox_randomSample_dataset["en_neutral_comment"], bart_base_detox_tokenizer)

In [15]:
bert_score_evaluate(paradetox_randomSample_dataset["en_toxic_comment"], paradetox_randomSample_dataset["en_neutral_comment"], t5_small_paradetox_1Token_model, t5_small_paradetox_1Token_tokenizer, highest_length_t5_small)

Generating predictions: 100%|██████████| 671/671 [02:36<00:00,  4.30it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Precision: 0.6002827882766724, Recall: 0.6506548523902893, F1 Score: 0.6244639754295349


In [16]:
bert_score_evaluate(paradetox_randomSample_dataset["en_toxic_comment"], paradetox_randomSample_dataset["en_neutral_comment"], bart_base_detox_model, bart_base_detox_tokenizer, highest_length_bart_base)

Generating predictions: 100%|██████████| 671/671 [04:52<00:00,  2.30it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Precision: 0.7774385213851929, Recall: 0.7530778646469116, F1 Score: 0.7647563219070435
