<a href="https://colab.research.google.com/github/HamdanXI/nlp_adventure/blob/main/ml801/evaluation_all_in_one.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the libraries this notebook uses: Hugging Face tooling, BERTScore and COMET.
# NOTE(review): no versions are pinned, so a fresh run may resolve different releases.
!pip install transformers datasets tqdm bert-score evaluate unbabel-comet
# BLEURT is installed directly from its GitHub repository.
!pip install git+https://github.com/google-research/bleurt.git
# NOTE(review): COMET is already requested from PyPI above (unbabel-comet); this second
# install from GitHub presumably replaces it -- confirm which one is intended.
!pip3 install git+https://github.com/Unbabel/COMET.git
# Upgrade TensorFlow last so it is applied after the installs above.
!pip install tensorflow --upgrade

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from datasets import load_dataset
from datasets import load_metric
import torch
from tqdm import tqdm
from evaluate import load

# Metrics
# `datasets.load_metric` is deprecated in favour of the `evaluate` library, so BLEURT is
# loaded through `evaluate.load` (imported above as `load`), exactly like COMET.
# NOTE(review): the `from datasets import load_metric` import above is now unused and
# may fail on recent `datasets` releases -- consider dropping it.
bleurt_metric = load('bleurt')
comet_metric = load('comet')


def _load_checkpoint(checkpoint):
    """Return the ``(tokenizer, model)`` pair stored under the Hub id ``checkpoint``."""
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
    return tokenizer, model


# Models: one tokenizer/model pair per fine-tuned checkpoint under evaluation.
cosine_tokenizer, cosine_model = _load_checkpoint(
    "HamdanXI/marefa-mt-en-ar-parallel-10k-splitted-cosine"
)
euclidean_tokenizer, euclidean_model = _load_checkpoint(
    "HamdanXI/marefa-mt-en-ar-parallel-10k-splitted-euclidean"
)
translate_cosine_tokenizer, translate_cosine_model = _load_checkpoint(
    "HamdanXI/marefa-mt-en-ar-parallel-10k-splitted-translated-cosine"
)

# Dataset: only the held-out test split is used for evaluation.
dataset = load_dataset("HamdanXI/arb-eng-parallel-10k-splitted", split="test")



Downloading builder script:   0%|          | 0.00/6.97k [00:00<?, ?B/s]

  warn(


Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading README.md:   0%|          | 0.00/3.53k [00:00<?, ?B/s]

Downloading LICENSE:   0%|          | 0.00/9.69k [00:00<?, ?B/s]

Downloading hparams.yaml:   0%|          | 0.00/567 [00:00<?, ?B/s]

Downloading .gitattributes:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

Downloading model.ckpt:   0%|          | 0.00/2.32G [00:00<?, ?B/s]

INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded checkpoint from v1.8.3.post1 to v2.1.2. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../root/.cache/huggingface/hub/models--Unbabel--wmt22-comet-da/snapshots/371e9839ca4e213dde891b066cf3080f75ec7e72/checkpoints/model.ckpt`


Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/616 [00:00<?, ?B/s]

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/core/saving.py:177: Found keys that are not in the model state dict but in the checkpoint: ['encoder.model.embeddings.position_ids']


Downloading tokenizer_config.json:   0%|          | 0.00/818 [00:00<?, ?B/s]

Downloading source.spm:   0%|          | 0.00/801k [00:00<?, ?B/s]

Downloading target.spm:   0%|          | 0.00/917k [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/2.24M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/74.0 [00:00<?, ?B/s]



Downloading config.json:   0%|          | 0.00/1.45k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/305M [00:00<?, ?B/s]

Downloading generation_config.json:   0%|          | 0.00/288 [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/818 [00:00<?, ?B/s]

Downloading source.spm:   0%|          | 0.00/801k [00:00<?, ?B/s]

Downloading target.spm:   0%|          | 0.00/917k [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/2.24M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/74.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/1.45k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/305M [00:00<?, ?B/s]

In [None]:
def max_token_length(input, label, tokenizer):
    """Return the largest token count found in either ``input`` or ``label``.

    Every item of both collections is encoded with ``tokenizer.encode`` and the
    length of the longest encoding is returned. Raises ``ValueError`` (from
    ``max``) if either collection is empty.
    """
    def longest_encoding(texts):
        # Token count of the longest item in ``texts``.
        return max(len(tokenizer.encode(text)) for text in texts)

    longest_input = longest_encoding(input)
    longest_label = longest_encoding(label)
    return max(longest_input, longest_label)

In [None]:
def generate_predictions(texts, model, tokenizer, highest_length, batch_size=1):
    """Run ``model.generate`` over ``texts`` and return the decoded outputs.

    Args:
        texts: Iterable of source strings to translate.
        model: Model exposing ``generate`` and ``device``.
        tokenizer: Tokenizer used both to encode ``texts`` and to decode outputs.
        highest_length: ``max_length`` passed to the tokenizer; longer inputs
            are truncated to it.
        batch_size: Number of texts tokenized and generated per call. The
            default of 1 keeps the original one-text-at-a-time behaviour;
            larger values trade memory for speed.

    Returns:
        List of decoded strings, one per generated sequence, in input order.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    texts = list(texts)  # accept any iterable while still allowing slicing
    predictions = []
    for start in tqdm(range(0, len(texts), batch_size), desc="Generating predictions"):
        batch = texts[start:start + batch_size]
        inputs = tokenizer(batch, padding=True, truncation=True, max_length=highest_length, return_tensors="pt")
        # Keep the inputs on the same device as the model so generation also
        # works when the model has been moved to an accelerator.
        inputs = inputs.to(model.device)
        with torch.no_grad():
            outputs = model.generate(**inputs)
        predictions.extend(tokenizer.batch_decode(outputs, skip_special_tokens=True))
    return predictions

# BLEURT Evaluation
def bleurt_evaluate(input, label, model, tokenizer, highest_length):
    """Print the mean BLEURT score of the model's predictions against ``label``.

    Predictions for ``input`` are produced with ``generate_predictions`` and
    scored with the module-level ``bleurt_metric``. An empty score list is
    reported as 0.
    """
    hypotheses = generate_predictions(input, model, tokenizer, highest_length)
    bleurt_scores = bleurt_metric.compute(predictions=hypotheses, references=label)['scores']
    if bleurt_scores:
        average_score = sum(bleurt_scores) / len(bleurt_scores)
    else:
        average_score = 0
    print(f"Average BLEURT Score: {average_score}")

# BERT Score Evaluation
def bert_score_evaluate(input, label, model, tokenizer, highest_length):
    """Print mean BERTScore precision, recall and F1 of the predictions vs ``label``.

    Predictions for ``input`` are produced with ``generate_predictions`` and
    compared against ``label`` using ``bert_score.score`` with baseline
    rescaling enabled.
    """
    # `score` was never imported anywhere in the notebook, so this function
    # raised NameError on a fresh kernel. Import it locally from bert-score,
    # which the install cell provides.
    from bert_score import score

    predictions = generate_predictions(input, model, tokenizer, highest_length)
    # NOTE(review): lang="en" is kept as in the original, but the callers pass
    # dataset["arabic"] as references -- confirm whether "ar" is intended.
    P, R, F1 = score(predictions, label, lang="en", rescale_with_baseline=True)
    print(f"Precision: {P.mean()}, Recall: {R.mean()}, F1 Score: {F1.mean()}")

# COMET Evaluate
def comet_evaluate(input, label, model, tokenizer, highest_length):
    """Print the COMET result for the model's predictions on ``input``.

    Predictions are produced with ``generate_predictions`` and passed to the
    module-level ``comet_metric`` together with the sources (``input``) and
    the references (``label``). The raw result object is printed as-is.
    """
    hypotheses = generate_predictions(input, model, tokenizer, highest_length)
    result = comet_metric.compute(
        predictions=hypotheses,
        references=label,
        sources=input,
    )
    print(result)

In [None]:
# Longest tokenized sentence (source or reference) under each model's tokenizer;
# used as the truncation limit when generating predictions.
english_texts = dataset["english"]
arabic_texts = dataset["arabic"]

highest_length_cosine = max_token_length(english_texts, arabic_texts, cosine_tokenizer)
highest_length_euclidean = max_token_length(english_texts, arabic_texts, euclidean_tokenizer)
highest_length_translate_cosine = max_token_length(english_texts, arabic_texts, translate_cosine_tokenizer)

## Evaluation

In [None]:
# Cosine -- BLEURT only; BERTScore and COMET for this model run in the next two cells.
bleurt_evaluate(
    input=dataset["english"],
    label=dataset["arabic"],
    model=cosine_model,
    tokenizer=cosine_tokenizer,
    highest_length=highest_length_cosine,
)

Average BLEURT Score: 0.574729475563647


In [None]:
# Cosine -- BERTScore
bert_score_evaluate(
    input=dataset["english"],
    label=dataset["arabic"],
    model=cosine_model,
    tokenizer=cosine_tokenizer,
    highest_length=highest_length_cosine,
)

In [None]:
# Cosine -- COMET
comet_evaluate(
    input=dataset["english"],
    label=dataset["arabic"],
    model=cosine_model,
    tokenizer=cosine_tokenizer,
    highest_length=highest_length_cosine,
)

In [None]:
# Euclidean -- run BLEURT, BERTScore and COMET in turn.
# Each evaluator regenerates the predictions through generate_predictions.
for evaluate_fn in (bleurt_evaluate, bert_score_evaluate, comet_evaluate):
    evaluate_fn(
        dataset["english"],
        dataset["arabic"],
        euclidean_model,
        euclidean_tokenizer,
        highest_length_euclidean,
    )

Generating predictions: 100%|██████████| 811/811 [05:25<00:00,  2.49it/s]


Average BLEURT Score: 0.6970971491275086


In [None]:
# Translated Cosine
# All three calls use highest_length_translate_cosine, the limit computed for this
# model's tokenizer. The first two previously referenced names that are never
# defined in this notebook (highest_length_1token_bart_base / _t5_small) and
# raised NameError on a fresh kernel.
bleurt_evaluate(dataset["english"], dataset["arabic"], translate_cosine_model, translate_cosine_tokenizer, highest_length_translate_cosine)
bert_score_evaluate(dataset["english"], dataset["arabic"], translate_cosine_model, translate_cosine_tokenizer, highest_length_translate_cosine)
comet_evaluate(dataset["english"], dataset["arabic"], translate_cosine_model, translate_cosine_tokenizer, highest_length_translate_cosine)

Generating predictions: 100%|██████████| 811/811 [05:32<00:00,  2.44it/s]


Average BLEURT Score: 0.6858602125707522
