<a href="https://colab.research.google.com/github/HamdanXI/nlp_adventure/blob/main/ml801/evaluation_all_in_one.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers datasets tqdm bert-score evaluate unbabel-comet
!pip install git+https://github.com/google-research/bleurt.git
!pip3 install git+https://github.com/Unbabel/COMET.git
!pip install tensorflow --upgrade

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from datasets import load_dataset
from datasets import load_metric
import torch
from tqdm import tqdm
from evaluate import load
from bert_score import score

bleurt_metric = load_metric('bleurt')
comet_metric = load('comet')

# Models
tokenizer = AutoTokenizer.from_pretrained("HamdanXI/marefa-mt-en-ar-parallel-10k-splitted")
model = AutoModelForSeq2SeqLM.from_pretrained("HamdanXI/marefa-mt-en-ar-parallel-10k-splitted")

cosine_tokenizer = AutoTokenizer.from_pretrained("HamdanXI/marefa-mt-en-ar-parallel-10k-splitted-cosine")
cosine_model = AutoModelForSeq2SeqLM.from_pretrained("HamdanXI/marefa-mt-en-ar-parallel-10k-splitted-cosine")

euclidean_tokenizer = AutoTokenizer.from_pretrained("HamdanXI/marefa-mt-en-ar-parallel-10k-splitted-euclidean")
euclidean_model = AutoModelForSeq2SeqLM.from_pretrained("HamdanXI/marefa-mt-en-ar-parallel-10k-splitted-euclidean")

translate_cosine_tokenizer = AutoTokenizer.from_pretrained("HamdanXI/marefa-mt-en-ar-parallel-10k-splitted-translated-cosine")
translate_cosine_model = AutoModelForSeq2SeqLM.from_pretrained("HamdanXI/marefa-mt-en-ar-parallel-10k-splitted-translated-cosine")

# Dataset
dataset = load_dataset("HamdanXI/arb-eng-parallel-10k-splitted", split="test")

In [39]:
# Max Token Length
def max_token_length(input, label, tokenizer):
  max_token_length_input = max(len(tokenizer.encode(item)) for item in input)
  max_token_length_label = max(len(tokenizer.encode(item)) for item in label)

  if max_token_length_input > max_token_length_label:
      highest_length = max_token_length_input
  else:
      highest_length = max_token_length_label

  return highest_length

# Generate Predictions
def generate_predictions(texts, model, tokenizer, highest_length):
    predictions = []
    for text in tqdm(texts, desc="Generating predictions"):
        inputs = tokenizer(text, padding=True, truncation=True, max_length=highest_length, return_tensors="pt")
        with torch.no_grad():
            outputs = model.generate(**inputs)
        predictions.extend([tokenizer.decode(output, skip_special_tokens=True) for output in outputs])
    return predictions

# BLEURT Evaluation
def bleurt_evaluate(input, label, model, tokenizer, highest_length):
    predictions = generate_predictions(input, model, tokenizer, highest_length)
    score_results = bleurt_metric.compute(predictions=predictions, references=label)
    scores = score_results['scores']
    average_score = sum(scores) / len(scores) if scores else 0
    print(f"Average BLEURT Score: {average_score}")

# BERT Score Evaluation
def bert_score_evaluate(input, label, model, tokenizer, highest_length):
  predictions = generate_predictions(input, model, tokenizer, highest_length)
  P, R, F1 = score(predictions, label, lang="en", rescale_with_baseline=True)
  print(f"Precision: {P.mean()}, Recall: {R.mean()}, F1 Score: {F1.mean()}")

# COMET Evaluate
def comet_evaluate(input, label, model, tokenizer, highest_length):
    predictions = generate_predictions(input, model, tokenizer, highest_length)
    comet_score = comet_metric.compute(predictions=predictions, references=label, sources=input)
    print(comet_score)

In [24]:
# Function to count the number of rows with token length greater than 512
max_length=512

def count_exceeding_token_length(inputs, labels, tokenizer, max_length=512):
    count = 0
    for input, label in zip(inputs, labels):
        input_length = len(tokenizer.encode(input, add_special_tokens=True))
        label_length = len(tokenizer.encode(label, add_special_tokens=True))
        if input_length > max_length or label_length > max_length:
            count += 1
    return count

# Example usage
exceeding_count = count_exceeding_token_length(dataset["english"], dataset["arabic"], cosine_tokenizer)
print(f"Number of rows exceeding the token length of {max_length}: {exceeding_count}")

Number of rows exceeding the token length of 512: 10


In [33]:
from datasets import Dataset

# Function to filter out rows with token length greater than 512
def filter_exceeding_token_length(inputs, labels, tokenizer, max_length=512):
    filtered_inputs = []
    filtered_labels = []
    for input, label in zip(inputs, labels):
        input_length = len(tokenizer.encode(input, add_special_tokens=True))
        label_length = len(tokenizer.encode(label, add_special_tokens=True))
        if input_length <= max_length and label_length <= max_length:
            filtered_inputs.append(input)
            filtered_labels.append(label)
    return filtered_inputs, filtered_labels

# Example usage
filtered_english, filtered_arabic = filter_exceeding_token_length(dataset["english"], dataset["arabic"], cosine_tokenizer)

filtered_data = {
    "english": filtered_english,
    "arabic": filtered_arabic
}

filtered_dataset = Dataset.from_dict(filtered_data)

In [34]:
highest_length = max_token_length(filtered_dataset["english"], filtered_dataset["arabic"], tokenizer)
highest_length_cosine = max_token_length(filtered_dataset["english"], filtered_dataset["arabic"], cosine_tokenizer)
highest_length_euclidean = max_token_length(filtered_dataset["english"], filtered_dataset["arabic"], euclidean_tokenizer)
highest_length_translate_cosine = max_token_length(filtered_dataset["english"], filtered_dataset["arabic"], translate_cosine_tokenizer)

## Evaluation

In [47]:
# Original
bleurt_evaluate(filtered_dataset["english"], filtered_dataset["arabic"], model, tokenizer, highest_length)
bert_score_evaluate(filtered_dataset["english"], filtered_dataset["arabic"], model, tokenizer, highest_length)
comet_evaluate(filtered_dataset["english"], filtered_dataset["arabic"], model, tokenizer, highest_length)

Generating predictions: 100%|██████████| 990/990 [48:02<00:00,  2.91s/it]


Average BLEURT Score: 0.28746049471500545


Generating predictions: 100%|██████████| 990/990 [47:48<00:00,  2.90s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Precision: 0.710999071598053, Recall: 0.697315514087677, F1 Score: 0.7035643458366394


Generating predictions: 100%|██████████| 990/990 [47:58<00:00,  2.91s/it]
INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


{'mean_score': 0.7877256003744675, 'scores': [0.8503618240356445, 0.8322935104370117, 0.9863495826721191, 0.8695116639137268, 0.6131543517112732, 0.9731025099754333, 0.7221502661705017, 0.8463395833969116, 0.8595158457756042, 0.9640747308731079, 0.4118642807006836, 0.6832497715950012, 0.2755265235900879, 0.7395756840705872, 0.9395692944526672, 0.894148051738739, 0.6407445073127747, 0.7250490784645081, 0.7655426859855652, 0.8921614289283752, 0.6943545937538147, 0.816938042640686, 0.6922697424888611, 0.8261359930038452, 0.9668891429901123, 0.8727145791053772, 0.8731088638305664, 0.7501506209373474, 0.8831837177276611, 0.6376141905784607, 0.7875956296920776, 0.9677118062973022, 0.7823079228401184, 0.43817034363746643, 0.8163723349571228, 0.7573623061180115, 0.836251437664032, 0.7543741464614868, 0.968146562576294, 0.8070049285888672, 0.7559369206428528, 0.649654746055603, 0.9797382354736328, 0.8490793704986572, 0.6139199137687683, 0.9194795489311218, 0.9256269335746765, 0.9337344765663147

In [36]:
# Cosine
bleurt_evaluate(filtered_dataset["english"], filtered_dataset["arabic"], cosine_model, cosine_tokenizer, highest_length_cosine)

Generating predictions: 100%|██████████| 990/990 [44:00<00:00,  2.67s/it]


Average BLEURT Score: 0.2754056655397319


In [40]:
bert_score_evaluate(filtered_dataset["english"], filtered_dataset["arabic"], cosine_model, cosine_tokenizer, highest_length_cosine)

Generating predictions: 100%|██████████| 990/990 [46:44<00:00,  2.83s/it]


Downloading config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Precision: 0.7030388116836548, Recall: 0.6979464292526245, F1 Score: 0.7001206874847412


In [41]:
comet_evaluate(filtered_dataset["english"], filtered_dataset["arabic"], cosine_model, cosine_tokenizer, highest_length_cosine)

Generating predictions: 100%|██████████| 990/990 [44:44<00:00,  2.71s/it]
INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


{'mean_score': 0.7849524437177061, 'scores': [0.8859003782272339, 0.7932345271110535, 0.9863495826721191, 0.771252453327179, 0.6446283459663391, 0.9710640907287598, 0.6595632433891296, 0.8630728125572205, 0.7661566138267517, 0.9663700461387634, 0.40533286333084106, 0.6665619611740112, 0.7749037146568298, 0.8073330521583557, 0.9395692944526672, 0.8832603096961975, 0.625222384929657, 0.777237057685852, 0.6321601867675781, 0.8056350350379944, 0.6624431610107422, 0.9057794809341431, 0.796846330165863, 0.9094573855400085, 0.9633030295372009, 0.8062624335289001, 0.7677479386329651, 0.7407079935073853, 0.8293113112449646, 0.6376141905784607, 0.8830552697181702, 0.9677118062973022, 0.6444244980812073, 0.45122531056404114, 0.8629436492919922, 0.7639089822769165, 0.8473998308181763, 0.667715847492218, 0.9432080388069153, 0.7957450747489929, 0.6480064988136292, 0.8908212184906006, 0.9797382354736328, 0.8316565155982971, 0.589607298374176, 0.9147369265556335, 0.9398252964019775, 0.9335417151451111

In [42]:
# Euclidean
bleurt_evaluate(filtered_dataset["english"], filtered_dataset["arabic"], euclidean_model, euclidean_tokenizer, highest_length_euclidean)
bert_score_evaluate(filtered_dataset["english"], filtered_dataset["arabic"], euclidean_model, euclidean_tokenizer, highest_length_euclidean)
comet_evaluate(filtered_dataset["english"], filtered_dataset["arabic"], euclidean_model, euclidean_tokenizer, highest_length_euclidean)

Generating predictions: 100%|██████████| 990/990 [51:25<00:00,  3.12s/it]


Average BLEURT Score: 0.28196061630968494


Generating predictions: 100%|██████████| 990/990 [51:03<00:00,  3.09s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Precision: 0.7134965658187866, Recall: 0.7098037600517273, F1 Score: 0.7113438248634338


Generating predictions: 100%|██████████| 990/990 [51:08<00:00,  3.10s/it]
INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


{'mean_score': 0.8054176590659402, 'scores': [0.8792088627815247, 0.8405627012252808, 0.9863495826721191, 0.729185163974762, 0.6384323835372925, 0.9731025099754333, 0.6768702864646912, 0.837611973285675, 0.8775690793991089, 0.9660256505012512, 0.46732357144355774, 0.7195078134536743, 0.7749037146568298, 0.8336623311042786, 0.9395692944526672, 0.9047354459762573, 0.6200940608978271, 0.7799856066703796, 0.8266271948814392, 0.8884070515632629, 0.6851984858512878, 0.8433808088302612, 0.6948868632316589, 0.7851268649101257, 0.9577919840812683, 0.8666976094245911, 0.7843926548957825, 0.816422164440155, 0.637382984161377, 0.6376141905784607, 0.8560730814933777, 0.9677118062973022, 0.7269176244735718, 0.42317840456962585, 0.8551533222198486, 0.7359763979911804, 0.8269252181053162, 0.8194593787193298, 0.9544587731361389, 0.8147139549255371, 0.6717543005943298, 0.7881066203117371, 0.9797382354736328, 0.8340060114860535, 0.6139199137687683, 0.9033762812614441, 0.9469308257102966, 0.81155282258987

In [43]:
# Translated Cosine
bleurt_evaluate(filtered_dataset["english"], filtered_dataset["arabic"], translate_cosine_model, translate_cosine_tokenizer, highest_length_translate_cosine)
bert_score_evaluate(filtered_dataset["english"], filtered_dataset["arabic"], translate_cosine_model, translate_cosine_tokenizer, highest_length_translate_cosine)
comet_evaluate(filtered_dataset["english"], filtered_dataset["arabic"], translate_cosine_model, translate_cosine_tokenizer, highest_length_translate_cosine)

Generating predictions: 100%|██████████| 990/990 [47:49<00:00,  2.90s/it]


Average BLEURT Score: 0.2874516498523228


Generating predictions: 100%|██████████| 990/990 [48:04<00:00,  2.91s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Precision: 0.7147113680839539, Recall: 0.7157450914382935, F1 Score: 0.7149397134780884


Generating predictions: 100%|██████████| 990/990 [47:01<00:00,  2.85s/it]
INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


{'mean_score': 0.818411573255905, 'scores': [0.8880829215049744, 0.7818105220794678, 0.9863495826721191, 0.8460065722465515, 0.6645530462265015, 0.9731025099754333, 0.6548100113868713, 0.8172083497047424, 0.817066490650177, 0.9663700461387634, 0.48269224166870117, 0.7493303418159485, 0.7749037146568298, 0.8244567513465881, 0.9395692944526672, 0.911053478717804, 0.7537718415260315, 0.9387820959091187, 0.7826411724090576, 0.8921614289283752, 0.7609955668449402, 0.8640007376670837, 0.7759323716163635, 0.9101871252059937, 0.9577919840812683, 0.8639983534812927, 0.9027069211006165, 0.8163407444953918, 0.8715516924858093, 0.6376141309738159, 0.8698890805244446, 0.9677118062973022, 0.6654006242752075, 0.42317840456962585, 0.7938796877861023, 0.8259394764900208, 0.8340010046958923, 0.778099536895752, 0.945871889591217, 0.8458545804023743, 0.682015597820282, 0.7771055102348328, 0.9797382354736328, 0.8522526025772095, 0.6139199137687683, 0.8848446011543274, 0.9512689709663391, 0.8135780692100525