<a href="https://colab.research.google.com/github/HamdanXI/nlp_adventure/blob/main/802%20code/comet-evaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers datasets tqdm
!pip install evaluate

In [7]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from datasets import load_dataset
from evaluate import load
import torch
from tqdm import tqdm

# COMET metric object, shared by every evaluation call in this notebook.
comet_metric = load('comet')

# Each tokenizer/model pair is read from the same Hub checkpoint name:
# the tokenizer is loaded first, then the seq2seq model.
(
    t5_small_paradetox_1Token_tokenizer,
    t5_small_paradetox_1Token_model,
) = (
    loader.from_pretrained("HamdanXI/t5-small-paradetox-1Token-split-masked")
    for loader in (AutoTokenizer, AutoModelForSeq2SeqLM)
)

# BART-base-detox (original note: 10,000 epochs with the learning rate of 3e-5)
(
    bart_base_detox_tokenizer,
    bart_base_detox_model,
) = (
    loader.from_pretrained("s-nlp/bart-base-detox")
    for loader in (AutoTokenizer, AutoModelForSeq2SeqLM)
)

(
    bart_base_paradetox_split_tokenizer,
    bart_base_paradetox_split_model,
) = (
    loader.from_pretrained("HamdanXI/bart-base-paradetox-split")
    for loader in (AutoTokenizer, AutoModelForSeq2SeqLM)
)


# Evaluation data: the full split dataset and its 1-token variant.
paradetox_dataset = load_dataset("HamdanXI/paradetox-split")
paradetox_1token_dataset = load_dataset("HamdanXI/paradetox-1Token-Split")

In [9]:
def max_token_length(input, label, tokenizer):
    """Return the longest encoded length found across two text collections.

    Every item of ``input`` and then every item of ``label`` is passed to
    ``tokenizer.encode``; the length of each result is measured and the
    largest one is returned.

    Args:
        input: Iterable of source texts. (The name shadows the builtin
            ``input``; it is kept so existing callers keep working.)
        label: Iterable of reference texts.
        tokenizer: Object exposing ``encode(text)`` that returns a sized
            sequence.

    Returns:
        int: The maximum encoded length over both collections, or ``0`` when
        both are empty.
    """
    # One pass over both collections; default=0 avoids the ValueError that
    # max() raises when it is handed an empty iterable.
    return max(
        (len(tokenizer.encode(item)) for texts in (input, label) for item in texts),
        default=0,
    )

In [12]:
def generate_predictions(texts, model, tokenizer, highest_length, batch_size=1):
    """Run ``model.generate`` over ``texts`` and return the decoded outputs.

    Args:
        texts: Iterable of input strings.
        model: Object exposing ``generate(**inputs)``. If it has a ``device``
            attribute, the tokenized inputs are moved to that device first.
        tokenizer: Callable tokenizer that also exposes ``decode``.
        highest_length: ``max_length`` used when tokenizing (inputs longer
            than this are truncated).
        batch_size: Number of texts tokenized and generated per step. The
            default of 1 processes one text at a time, as before.

    Returns:
        list: One decoded string per generated sequence, in input order.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    texts = list(texts)  # allow generators and dataset columns alike
    # Models moved to an accelerator need their inputs on the same device.
    device = getattr(model, "device", None)

    predictions = []
    for start in tqdm(range(0, len(texts), batch_size), desc="Generating predictions"):
        batch = texts[start:start + batch_size]
        inputs = tokenizer(batch, padding=True, truncation=True, max_length=highest_length, return_tensors="pt")
        if device is not None:
            inputs = inputs.to(device)
        # NOTE(review): highest_length only bounds the tokenized input here;
        # generate() runs with the model's own generation settings, so the
        # output length is governed by those - confirm this is intended.
        with torch.no_grad():
            outputs = model.generate(**inputs)
        predictions.extend([tokenizer.decode(output, skip_special_tokens=True) for output in outputs])
    return predictions

def comet_evaluate(input, label, model, tokenizer, highest_length):
    """Generate predictions for ``input`` and score them with COMET.

    Args:
        input: Source texts; passed to the model and to COMET as ``sources``.
        label: Reference texts; passed to COMET as ``references``.
        model: Seq2seq model handed to ``generate_predictions``.
        tokenizer: Tokenizer handed to ``generate_predictions``.
        highest_length: Tokenization ``max_length`` handed to
            ``generate_predictions``.

    Returns:
        The result of ``comet_metric.compute``: a dict with a ``mean_score``
        entry and a per-segment ``scores`` list.
    """
    predictions = generate_predictions(input, model, tokenizer, highest_length)

    comet_score = comet_metric.compute(predictions=predictions, references=label, sources=input)

    # Print a one-line summary rather than the full per-segment list; the
    # complete result is returned so the caller can keep or inspect it.
    print(f"COMET mean_score: {comet_score['mean_score']} ({len(comet_score['scores'])} segments)")

    return comet_score

In [13]:
# Longest tokenized length on the 1-token test split, measured once per
# tokenizer (same order as the target names on the left).
(
    highest_length_1token_t5_small,
    highest_length_1token_bart_base,
    highest_length_1token_bart_base_split,
) = (
    max_token_length(
        paradetox_1token_dataset['test']["en_toxic_comment"],
        paradetox_1token_dataset['test']["en_neutral_comment"],
        tok,
    )
    for tok in (
        t5_small_paradetox_1Token_tokenizer,
        bart_base_detox_tokenizer,
        bart_base_paradetox_split_tokenizer,
    )
)

In [14]:
# COMET on the 1-token test split using the T5-small model and tokenizer.
comet_evaluate(
    input=paradetox_1token_dataset['test']["en_toxic_comment"],
    label=paradetox_1token_dataset['test']["en_neutral_comment"],
    model=t5_small_paradetox_1Token_model,
    tokenizer=t5_small_paradetox_1Token_tokenizer,
    highest_length=highest_length_1token_t5_small,
)

Generating predictions: 100%|██████████| 811/811 [02:29<00:00,  5.44it/s]
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


{'mean_score': 0.9140760001948083, 'scores': [0.9706261157989502, 0.9733459949493408, 0.8995892405509949, 0.8731541633605957, 0.8943524360656738, 0.9542368054389954, 0.9856336712837219, 0.9650577306747437, 0.9055675268173218, 0.8421628475189209, 0.9644774794578552, 0.9856791496276855, 0.8571516871452332, 0.9535357356071472, 0.951155960559845, 0.9878146052360535, 0.9732802510261536, 0.9134406447410583, 0.9092074632644653, 0.8214377760887146, 0.8920785188674927, 0.8223893642425537, 0.9489462971687317, 0.9783465266227722, 0.9894618988037109, 0.9623310565948486, 0.9479609727859497, 0.9701493382453918, 0.984663188457489, 0.9505229592323303, 0.8637273907661438, 0.8026464581489563, 0.830024242401123, 0.9203208684921265, 0.8868678212165833, 0.9835817217826843, 0.937213659286499, 0.9846112728118896, 0.9466802477836609, 0.9361201524734497, 0.7306434512138367, 0.9895156621932983, 0.9817520976066589, 0.8740952610969543, 0.9815250039100647, 0.9727117419242859, 0.9845553636550903, 0.632552444934845,

In [15]:
# COMET on the 1-token test split using the s-nlp BART-base-detox model and tokenizer.
comet_evaluate(
    input=paradetox_1token_dataset['test']["en_toxic_comment"],
    label=paradetox_1token_dataset['test']["en_neutral_comment"],
    model=bart_base_detox_model,
    tokenizer=bart_base_detox_tokenizer,
    highest_length=highest_length_1token_bart_base,
)

Generating predictions: 100%|██████████| 811/811 [05:40<00:00,  2.38it/s]
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


{'mean_score': 0.9228425261859682, 'scores': [0.9706261157989502, 0.9485672116279602, 0.9344232678413391, 0.8936533331871033, 0.8943525552749634, 0.9542368054389954, 0.9563533067703247, 0.9707089066505432, 0.9758228659629822, 0.7061155438423157, 0.9730655550956726, 0.9856791496276855, 0.6553710103034973, 0.8929172158241272, 0.9141572117805481, 0.9792084097862244, 0.9732802510261536, 0.9095726609230042, 0.9254249930381775, 0.8214377760887146, 0.9103562831878662, 0.9719513058662415, 0.9710323214530945, 0.9783465266227722, 0.9596300721168518, 0.9389711022377014, 0.9338528513908386, 0.9701493382453918, 0.9646156430244446, 0.9426994919776917, 0.7496259212493896, 0.8713024854660034, 0.937779426574707, 0.9203208684921265, 0.9460350871086121, 0.9835817813873291, 0.9705926179885864, 0.9846112728118896, 0.8625745177268982, 0.8711992502212524, 0.932090699672699, 0.9817295670509338, 0.9817520976066589, 0.9479271769523621, 0.9687389731407166, 0.9597224593162537, 0.9845553636550903, 0.93990159034729

In [16]:
# COMET on the 1-token test split using the BART-base paradetox-split model and tokenizer.
comet_evaluate(
    input=paradetox_1token_dataset['test']["en_toxic_comment"],
    label=paradetox_1token_dataset['test']["en_neutral_comment"],
    model=bart_base_paradetox_split_model,
    tokenizer=bart_base_paradetox_split_tokenizer,
    highest_length=highest_length_1token_bart_base_split,
)

Generating predictions: 100%|██████████| 811/811 [05:46<00:00,  2.34it/s]
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


{'mean_score': 0.9256481997181837, 'scores': [0.8879155516624451, 0.9485672116279602, 0.8948164582252502, 0.9418111443519592, 0.7272639870643616, 0.9542368054389954, 0.9856336712837219, 0.9707089066505432, 0.9055674076080322, 0.8735180497169495, 0.9644774794578552, 0.9856791496276855, 0.9458654522895813, 0.829631507396698, 0.9735392928123474, 0.9792084097862244, 0.9574660658836365, 0.9134406447410583, 0.9092076420783997, 0.8214378356933594, 0.6978150010108948, 0.8796635270118713, 0.954390823841095, 0.9783465266227722, 0.9596300721168518, 0.9389711022377014, 0.9479609131813049, 0.9701493382453918, 0.9646156430244446, 0.9505229592323303, 0.8637273907661438, 0.9465556740760803, 0.937779426574707, 0.9825727343559265, 0.9460350871086121, 0.9835817813873291, 0.9531048536300659, 0.9846112728118896, 0.9466802477836609, 0.9361201524734497, 0.932090699672699, 0.9817295670509338, 0.9817520976066589, 0.9763200283050537, 0.9815250039100647, 0.9597224593162537, 0.9786179661750793, 0.9669197201728821

In [17]:
# Longest tokenized length on the full paradetox test split, measured once
# per tokenizer (same order as the target names on the left).
(
    highest_length_t5_small,
    highest_length_bart_base,
    highest_length_bart_base_split,
) = (
    max_token_length(
        paradetox_dataset['test']["en_toxic_comment"],
        paradetox_dataset['test']["en_neutral_comment"],
        tok,
    )
    for tok in (
        t5_small_paradetox_1Token_tokenizer,
        bart_base_detox_tokenizer,
        bart_base_paradetox_split_tokenizer,
    )
)

In [18]:
# COMET on the full paradetox test split using the T5-small model and tokenizer.
comet_evaluate(
    input=paradetox_dataset['test']["en_toxic_comment"],
    label=paradetox_dataset['test']["en_neutral_comment"],
    model=t5_small_paradetox_1Token_model,
    tokenizer=t5_small_paradetox_1Token_tokenizer,
    highest_length=highest_length_t5_small,
)

Generating predictions: 100%|██████████| 671/671 [02:31<00:00,  4.42it/s]
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


{'mean_score': 0.7346421954528763, 'scores': [0.7660730481147766, 0.6379521489143372, 0.8076091408729553, 0.6774418950080872, 0.6823726296424866, 0.5735591053962708, 0.8777720928192139, 0.952459454536438, 0.708164393901825, 0.5283412337303162, 0.8421065807342529, 0.5810727477073669, 0.6699661612510681, 0.41696450114250183, 0.45321470499038696, 0.7463365197181702, 0.8802365660667419, 0.871311366558075, 0.7467745542526245, 0.5592644810676575, 0.9403615593910217, 0.6845069527626038, 0.7319971323013306, 0.807745635509491, 0.6049343943595886, 0.8595831394195557, 0.7270424962043762, 0.7648990750312805, 0.9227526783943176, 0.7997810244560242, 0.6322688460350037, 0.8869372010231018, 0.6967604756355286, 0.7553336024284363, 0.6419172286987305, 0.8427456021308899, 0.8620110154151917, 0.4003101885318756, 0.6784992814064026, 0.7983162999153137, 0.5955015420913696, 0.7520713210105896, 0.3820982277393341, 0.7800993323326111, 0.6111044883728027, 0.7711215615272522, 0.8369945883750916, 0.79728102684021

In [19]:
# COMET on the full paradetox test split using the s-nlp BART-base-detox model and tokenizer.
comet_evaluate(
    input=paradetox_dataset['test']["en_toxic_comment"],
    label=paradetox_dataset['test']["en_neutral_comment"],
    model=bart_base_detox_model,
    tokenizer=bart_base_detox_tokenizer,
    highest_length=highest_length_bart_base,
)

Generating predictions: 100%|██████████| 671/671 [04:41<00:00,  2.39it/s]
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


{'mean_score': 0.8413672567124516, 'scores': [0.9844344258308411, 0.6427392363548279, 0.886589527130127, 0.711360514163971, 0.9706104397773743, 0.6406819224357605, 0.8990210890769958, 0.9702252745628357, 0.8432188630104065, 0.5602098107337952, 0.9070712327957153, 0.7629187703132629, 0.9156485199928284, 0.9531250596046448, 0.5078497529029846, 0.9844242930412292, 0.924683153629303, 0.9041725993156433, 0.9028647541999817, 0.5953370928764343, 0.9702807068824768, 0.9790535569190979, 0.6355805397033691, 0.882779061794281, 0.8118890523910522, 0.863642692565918, 0.6696563363075256, 0.9434171319007874, 0.9655908942222595, 0.9584135413169861, 0.8482168316841125, 0.9390712976455688, 0.9584895372390747, 0.9348630309104919, 0.8174566626548767, 0.8684192895889282, 0.8901166319847107, 0.9000057578086853, 0.9662593007087708, 0.9713380932807922, 0.9394025206565857, 0.6848529577255249, 0.9781342148780823, 0.9692235589027405, 0.8875507712364197, 0.7214944362640381, 0.8390582799911499, 0.9489404559135437,

In [20]:
# COMET on the full paradetox test split using the BART-base paradetox-split model and tokenizer.
comet_evaluate(
    input=paradetox_dataset['test']["en_toxic_comment"],
    label=paradetox_dataset['test']["en_neutral_comment"],
    model=bart_base_paradetox_split_model,
    tokenizer=bart_base_paradetox_split_tokenizer,
    highest_length=highest_length_bart_base_split,
)

Generating predictions: 100%|██████████| 671/671 [05:04<00:00,  2.20it/s]
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


{'mean_score': 0.7766079932850981, 'scores': [0.7660732865333557, 0.6427392363548279, 0.8939095139503479, 0.7129709124565125, 0.829850435256958, 0.6319682002067566, 0.8777720928192139, 0.9702252745628357, 0.8577783703804016, 0.512165367603302, 0.8421065211296082, 0.6418787837028503, 0.669965922832489, 0.45670434832572937, 0.6238663792610168, 0.8558434844017029, 0.924683153629303, 0.871311366558075, 0.8820770382881165, 0.669731080532074, 0.9403615593910217, 0.8168274760246277, 0.8056710362434387, 0.882779061794281, 0.7409459948539734, 0.863642692565918, 0.6696563363075256, 0.6990942358970642, 0.9227526783943176, 0.8413587212562561, 0.6185286045074463, 0.9234694838523865, 0.720320463180542, 0.9299905896186829, 0.63313227891922, 0.8684192895889282, 0.8821540474891663, 0.7690690755844116, 0.7564262747764587, 0.9461690187454224, 0.5926603078842163, 0.7757216095924377, 0.4442440867424011, 0.7649702429771423, 0.7113968133926392, 0.8274446725845337, 0.9013405442237854, 0.7887991070747375, 0.56