<a href="https://colab.research.google.com/github/HamdanXI/nlp_adventure/blob/main/Model_Evaluation_T5_Small_Gloss_to_Text_VERY_GOOD.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers
!pip install datasets
!pip install sentencepiece
!pip install torch
!pip install bert_score

In [10]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
from datasets import load_dataset, load_metric
import torch

# Load the fine-tuned gloss-to-text checkpoint together with its tokenizer.
checkpoint = "HamdanXI/t5_small_gloss_merged_dataset"
model = T5ForConditionalGeneration.from_pretrained(checkpoint)
tokenizer = T5Tokenizer.from_pretrained(checkpoint)

# Load the dataset.
dataset = load_dataset("aslg_pc12")

# Hold out a fraction of the train split for evaluation. The seed is fixed so
# that a fresh "Restart & Run All" scores exactly the same rows every time;
# without it the subset (and therefore every reported metric) changes per run.
EVAL_FRACTION = 0.1  # 10% of the train split
SPLIT_SEED = 42
eval_dataset = dataset["train"].train_test_split(
    test_size=EVAL_FRACTION, seed=SPLIT_SEED
)["test"]

# Raw strings only: glosses are the model inputs, texts are the references.
# Tokenization is done in a later cell.
input_texts = eval_dataset["gloss"]
target_texts = eval_dataset["text"]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.50k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/242M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/142 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/20.7k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/2.63k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [11]:
# Measure how many tokens each example occupies once encoded, so that the
# truncation limit used later can be chosen with the longest example in mind.
gloss_token_counts = [len(tokenizer.encode(gloss)) for gloss in eval_dataset["gloss"]]
text_token_counts = [len(tokenizer.encode(sentence)) for sentence in eval_dataset["text"]]

max_token_length_gloss = max(gloss_token_counts)
max_token_length_text = max(text_token_counts)

print(f"Max token length for gloss: {max_token_length_gloss}")
print(f"Max token length for text: {max_token_length_text}")

Max token length for gloss: 105
Max token length for text: 76


In [None]:
# Evaluation settings.
# MAX_LENGTH bounds both tokenization and generation; the longest sequences
# measured in the previous cell were 105 (gloss) and 76 (text) tokens.
MAX_LENGTH = 110
# Generating for the whole evaluation set in one call can exhaust memory, so
# the examples are processed in fixed-size slices instead.
BATCH_SIZE = 32

inputs = tokenizer(input_texts, return_tensors="pt", padding=True, truncation=True, max_length=MAX_LENGTH)
# Tokenized references; not consumed by the generation or scoring below.
labels = tokenizer(target_texts, return_tensors="pt", padding=True, truncation=True, max_length=MAX_LENGTH).input_ids

# Make predictions
model.eval()

decoded_predictions = []
with torch.no_grad():
    for start in range(0, inputs.input_ids.size(0), BATCH_SIZE):
        stop = start + BATCH_SIZE
        outputs = model.generate(
            # Tensors follow the model so this works on CPU or GPU alike.
            input_ids=inputs.input_ids[start:stop].to(model.device),
            # The batch is padded, so the mask is required to stop the model
            # from attending to pad tokens.
            attention_mask=inputs.attention_mask[start:stop].to(model.device),
            # Without an explicit limit, generate() falls back to the
            # configured default (commonly 20 tokens), which would cut off
            # predictions well short of the longest references.
            max_length=MAX_LENGTH,
        )
        decoded_predictions.extend(
            tokenizer.batch_decode(outputs, skip_special_tokens=True)
        )

# Compute BERTScore
metric = load_metric("bertscore")
score = metric.compute(predictions=decoded_predictions, references=target_texts, lang="en")

# BERTScore returns one value per example; report the mean of each measure.
mean_precision = sum(score['precision']) / len(score['precision'])
mean_recall = sum(score['recall']) / len(score['recall'])
mean_f1 = sum(score['f1']) / len(score['f1'])

print(f"Mean Precision: {mean_precision}")
print(f"Mean Recall: {mean_recall}")
print(f"Mean F1: {mean_f1}")

