<a href="https://colab.research.google.com/github/HamdanXI/nlp_adventure/blob/main/ml801/evaluation_all_in_one.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers datasets tqdm bert-score evaluate unbabel-comet
!pip install git+https://github.com/google-research/bleurt.git
!pip3 install git+https://github.com/Unbabel/COMET.git
!pip install tensorflow --upgrade

In [None]:
import torch
from datasets import Dataset
from datasets import load_dataset
# NOTE(review): `load_metric` is deprecated in favour of `evaluate.load` and may be missing
# from recent `datasets` releases; confirm the installed version still provides it.
from datasets import load_metric
from evaluate import load
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Metrics
# Both metrics are loaded through `evaluate.load` (imported above as `load`);
# `datasets.load_metric` is deprecated in favour of it.
bleurt_metric = load('bleurt')
comet_metric = load('comet')

# Models
# Each checkpoint id is named once and used for both its tokenizer and its seq2seq model,
# so the two can never point at different repositories by accident.
COSINE_CHECKPOINT = "HamdanXI/marefa-mt-en-ar-parallel-10k-splitted-cosine"
EUCLIDEAN_CHECKPOINT = "HamdanXI/marefa-mt-en-ar-parallel-10k-splitted-euclidean"
TRANSLATE_COSINE_CHECKPOINT = "HamdanXI/marefa-mt-en-ar-parallel-10k-splitted-translated-cosine"

cosine_tokenizer = AutoTokenizer.from_pretrained(COSINE_CHECKPOINT)
cosine_model = AutoModelForSeq2SeqLM.from_pretrained(COSINE_CHECKPOINT)

euclidean_tokenizer = AutoTokenizer.from_pretrained(EUCLIDEAN_CHECKPOINT)
euclidean_model = AutoModelForSeq2SeqLM.from_pretrained(EUCLIDEAN_CHECKPOINT)

translate_cosine_tokenizer = AutoTokenizer.from_pretrained(TRANSLATE_COSINE_CHECKPOINT)
translate_cosine_model = AutoModelForSeq2SeqLM.from_pretrained(TRANSLATE_COSINE_CHECKPOINT)

# Dataset
# Only the "test" split is loaded; later cells read its "english" and "arabic" columns.
dataset = load_dataset("HamdanXI/arb-eng-parallel-10k-splitted", split="test")

In [5]:
# Max Token Length
def max_token_length(input, label, tokenizer):
    """Return the largest tokenized length found in either ``input`` or ``label``.

    Args:
        input: Iterable of source texts.
        label: Iterable of reference texts.
        tokenizer: Object exposing ``encode(text)`` that returns a sized sequence.

    Returns:
        The maximum ``len(tokenizer.encode(text))`` over every text in both iterables.

    Raises:
        ValueError: If either iterable is empty (``max`` of an empty sequence).
    """
    def _longest(texts):
        # Length, in tokens, of the longest text in one collection.
        return max(len(tokenizer.encode(text)) for text in texts)

    return max(_longest(input), _longest(label))

# Generate Predictions
def generate_predictions(texts, model, tokenizer, highest_length, batch_size=1):
    """Generate one decoded output string per input text.

    Args:
        texts: Iterable of source strings.
        model: Seq2seq model exposing ``generate`` and ``device``.
        tokenizer: Tokenizer matching ``model``; used to encode inputs and decode outputs.
        highest_length: ``max_length`` used when tokenizing; longer inputs are truncated.
        batch_size: Number of texts sent to ``model.generate`` per call. The default of 1
            reproduces the previous one-text-at-a-time behaviour; larger values pad each
            batch to its longest member.

    Returns:
        List of decoded strings (special tokens stripped), in the same order as ``texts``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    texts = list(texts)
    # Tokenized inputs must live on the same device as the model, otherwise `generate`
    # fails as soon as the model is moved off the CPU.
    device = model.device

    predictions = []
    for start in tqdm(range(0, len(texts), batch_size), desc="Generating predictions"):
        batch = texts[start:start + batch_size]
        inputs = tokenizer(
            batch,
            padding=True,
            truncation=True,
            max_length=highest_length,
            return_tensors="pt",
        ).to(device)
        # Inference only: skip autograd bookkeeping.
        with torch.no_grad():
            outputs = model.generate(**inputs)
        predictions.extend(tokenizer.batch_decode(outputs, skip_special_tokens=True))
    return predictions

# BLEURT Evaluation
def bleurt_evaluate(input, label, model, tokenizer, highest_length):
    """Generate predictions for ``input`` and print their mean BLEURT score against ``label``.

    Args:
        input: Source texts passed to ``generate_predictions``.
        label: Reference texts the predictions are scored against.
        model: Model used to generate predictions.
        tokenizer: Tokenizer paired with ``model``.
        highest_length: Tokenization ``max_length`` forwarded to ``generate_predictions``.

    Prints the average of the per-example scores, or 0 when no scores are returned.
    """
    hypotheses = generate_predictions(input, model, tokenizer, highest_length)
    scores = bleurt_metric.compute(predictions=hypotheses, references=label)['scores']

    if scores:
        average_score = sum(scores) / len(scores)
    else:
        # Nothing was scored; avoid dividing by zero.
        average_score = 0

    print(f"Average BLEURT Score: {average_score}")

# BERT Score Evaluation
def bert_score_evaluate(input, label, model, tokenizer, highest_length, lang="ar"):
    """Generate predictions for ``input`` and print BERTScore precision/recall/F1 against ``label``.

    Args:
        input: Source texts passed to ``generate_predictions``.
        label: Reference texts the predictions are scored against.
        model: Model used to generate predictions.
        tokenizer: Tokenizer paired with ``model``.
        highest_length: Tokenization ``max_length`` forwarded to ``generate_predictions``.
        lang: Language code of the predictions/references, forwarded to ``bert_score.score``.
            Defaults to ``"ar"`` because every caller in this notebook scores against the
            dataset's "arabic" column; the previous hard-coded ``"en"`` selected an
            English scoring model for Arabic text.
    """
    # `score` was never imported anywhere in the notebook, so the original call raised
    # NameError on a fresh kernel. It is imported locally so this function is self-contained.
    from bert_score import score

    predictions = generate_predictions(input, model, tokenizer, highest_length)
    # NOTE(review): bert_score may not ship a rescaling baseline for every language; confirm
    # one exists for `lang`, otherwise the printed values may be unrescaled.
    P, R, F1 = score(predictions, label, lang=lang, rescale_with_baseline=True)
    print(f"Precision: {P.mean()}, Recall: {R.mean()}, F1 Score: {F1.mean()}")

# COMET Evaluate
def comet_evaluate(input, label, model, tokenizer, highest_length):
    """Generate predictions for ``input`` and print the COMET metric result.

    Args:
        input: Source texts; used both to generate predictions and as COMET ``sources``.
        label: Reference texts passed as COMET ``references``.
        model: Model used to generate predictions.
        tokenizer: Tokenizer paired with ``model``.
        highest_length: Tokenization ``max_length`` forwarded to ``generate_predictions``.
    """
    comet_score = comet_metric.compute(
        sources=input,
        references=label,
        predictions=generate_predictions(input, model, tokenizer, highest_length),
    )
    print(comet_score)

In [24]:
# Function to count the number of rows with token length greater than 512
# Token-length limit reported by the row-count check in this cell.
max_length = 512

def count_exceeding_token_length(inputs, labels, tokenizer, max_length=512):
    """Count the (input, label) pairs in which either side tokenizes to more than ``max_length``.

    Args:
        inputs: Iterable of source texts.
        labels: Iterable of reference texts, paired positionally with ``inputs``.
        tokenizer: Object exposing ``encode(text, add_special_tokens=True)``.
        max_length: Token-count threshold; lengths must be strictly greater to count.

    Returns:
        Number of pairs with at least one side longer than ``max_length`` tokens.
    """
    def _n_tokens(text):
        # Token count including special tokens.
        return len(tokenizer.encode(text, add_special_tokens=True))

    def _too_long(pair):
        source_len, target_len = (_n_tokens(text) for text in pair)
        return max(source_len, target_len) > max_length

    return sum(1 for pair in zip(inputs, labels) if _too_long(pair))

# Example usage
# Pass the threshold explicitly: previously the call relied on the function's own default
# while the message printed `max_length`, so changing one would silently misreport the other.
exceeding_count = count_exceeding_token_length(
    dataset["english"], dataset["arabic"], cosine_tokenizer, max_length=max_length
)
print(f"Number of rows exceeding the token length of {max_length}: {exceeding_count}")

Number of rows exceeding the token length of 512: 10


In [33]:
# Function to filter out rows with token length greater than 512
def filter_exceeding_token_length(inputs, labels, tokenizer, max_length=512):
    """Keep only the (input, label) pairs in which both sides fit within ``max_length`` tokens.

    Args:
        inputs: Iterable of source texts.
        labels: Iterable of reference texts, paired positionally with ``inputs``.
        tokenizer: Object exposing ``encode(text, add_special_tokens=True)``.
        max_length: Inclusive token-count limit applied to both sides of a pair.

    Returns:
        Tuple ``(filtered_inputs, filtered_labels)`` of two lists in the original order.
    """
    def _n_tokens(text):
        # Token count including special tokens.
        return len(tokenizer.encode(text, add_special_tokens=True))

    kept_pairs = []
    for source, target in zip(inputs, labels):
        source_len = _n_tokens(source)
        target_len = _n_tokens(target)
        if max(source_len, target_len) > max_length:
            # At least one side is too long: drop the whole pair.
            continue
        kept_pairs.append((source, target))

    filtered_inputs = [source for source, _ in kept_pairs]
    filtered_labels = [target for _, target in kept_pairs]
    return filtered_inputs, filtered_labels

# Example usage
# Drop over-length pairs (measured with the cosine tokenizer), using the same `max_length`
# threshold that the counting cell reports.
filtered_english, filtered_arabic = filter_exceeding_token_length(
    dataset["english"], dataset["arabic"], cosine_tokenizer, max_length=max_length
)
# The two columns must stay aligned row-for-row before being turned into a dataset.
assert len(filtered_english) == len(filtered_arabic), "filtered columns are misaligned"

filtered_data = {
    "english": filtered_english,
    "arabic": filtered_arabic
}

# `Dataset` comes from the `datasets` import in the notebook's import cell; it was
# previously used here without ever being imported (NameError on a fresh kernel).
filtered_dataset = Dataset.from_dict(filtered_data)

In [34]:
# Longest tokenized text in the filtered data, measured separately with each model's tokenizer.
highest_length_cosine, highest_length_euclidean, highest_length_translate_cosine = (
    max_token_length(filtered_dataset["english"], filtered_dataset["arabic"], tokenizer)
    for tokenizer in (cosine_tokenizer, euclidean_tokenizer, translate_cosine_tokenizer)
)

## Evaluation

In [None]:
# Cosine: BLEURT only in this cell; BERTScore and COMET for the same model run in the
# two cells that follow.
bleurt_evaluate(
    filtered_dataset["english"],
    filtered_dataset["arabic"],
    cosine_model,
    cosine_tokenizer,
    highest_length_cosine,
)

Generating predictions:  44%|████▎     | 431/990 [18:49<29:54,  3.21s/it]

In [None]:
# Cosine: BERTScore
bert_score_evaluate(
    filtered_dataset["english"],
    filtered_dataset["arabic"],
    cosine_model,
    cosine_tokenizer,
    highest_length_cosine,
)

In [None]:
# Cosine: COMET
comet_evaluate(
    filtered_dataset["english"],
    filtered_dataset["arabic"],
    cosine_model,
    cosine_tokenizer,
    highest_length_cosine,
)

In [None]:
# Euclidean: run the three metrics in order (BLEURT, BERTScore, COMET) on the same model.
for evaluate_fn in (bleurt_evaluate, bert_score_evaluate, comet_evaluate):
    evaluate_fn(
        filtered_dataset["english"],
        filtered_dataset["arabic"],
        euclidean_model,
        euclidean_tokenizer,
        highest_length_euclidean,
    )

Generating predictions: 100%|██████████| 811/811 [05:25<00:00,  2.49it/s]


Average BLEURT Score: 0.6970971491275086


In [None]:
# Translated Cosine: run the three metrics in order (BLEURT, BERTScore, COMET) on the same model.
for evaluate_fn in (bleurt_evaluate, bert_score_evaluate, comet_evaluate):
    evaluate_fn(
        filtered_dataset["english"],
        filtered_dataset["arabic"],
        translate_cosine_model,
        translate_cosine_tokenizer,
        highest_length_translate_cosine,
    )

Generating predictions: 100%|██████████| 811/811 [05:32<00:00,  2.44it/s]


Average BLEURT Score: 0.6858602125707522
