## Fine-tuned plbart model evaluation script

In [1]:
# Install required packages
!pip install transformers
!pip install huggingface_hub
!pip install rouge-score sacrebleu

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sacrebleu
  Downloading sacrebleu-2.4.3-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
Collecting portalocker (from sacrebleu)
  Downloading portalocker-2.10.1-py3-none-any.whl.metadata (8.5 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading sacrebleu-2.4.3-py3-none-any.whl (103 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.0/104.0 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Downloading portalocker-2.10.1-py3-none-any.whl (18 kB)
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_sc

In [3]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
from transformers import PLBartTokenizer, PLBartForConditionalGeneration
import sacrebleu
from rouge_score import rouge_scorer
import torch
import json

# Load the new evaluation dataset
eval_data_path = '/content/drive/MyDrive/itrl/standardized_eval_dataset.json'
with open(eval_data_path, 'r') as file:
    dataset = json.load(file)

# Separate the dataset into inputs and references
inputs = [item['text'] for item in dataset]  # Algorithmic description in natural language
references = [item['code'] for item in dataset]  # Corresponding Python code

# Load your fine-tuned PLBart model and tokenizer
model_path = "GS-23/plbart-algo2code"
tokenizer = PLBartTokenizer.from_pretrained(model_path)
model = PLBartForConditionalGeneration.from_pretrained(model_path)

# List to store predictions
predictions = []

# Generate predictions for each input in the dataset
for input_text in inputs:
    # Tokenize the input data
    inputs_tokenized = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True)

    # Generate predictions
    with torch.no_grad():
        output = model.generate(**inputs_tokenized, decoder_start_token_id=tokenizer.lang_code_to_id["__python__"])

    # Decode the prediction into text and store it in the predictions list
    prediction = tokenizer.decode(output[0], skip_special_tokens=True)

    # Postprocess the generated output to replace the custom token with newline and indent
    prediction = prediction.replace('__newline_indent__', '\n    ')
    predictions.append(prediction)

# Calculate the BLEU score for the entire dataset
bleu = sacrebleu.corpus_bleu(predictions, [references])
print(f"BLEU score: {bleu.score:.2f}")

# Calculate the Exact Match score for the entire dataset
exact_matches = sum([1 if pred.strip() == ref.strip() else 0 for pred, ref in zip(predictions, references)])
exact_match_score = exact_matches / len(references) * 100
print(f"Exact Match score: {exact_match_score:.2f}%")

# Calculate the chrF score for the entire dataset
chrf = sacrebleu.corpus_chrf(predictions, [references])
print(f"chrF score: {chrf.score:.2f}")

# Calculate the ROUGE score for the entire dataset
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
rouge1_scores = []
rouge2_scores = []
rougeL_scores = []

for pred, ref in zip(predictions, references):
    scores = scorer.score(ref, pred)
    rouge1_scores.append(scores['rouge1'].fmeasure)
    rouge2_scores.append(scores['rouge2'].fmeasure)
    rougeL_scores.append(scores['rougeL'].fmeasure)

# Calculate average ROUGE scores
average_rouge1 = sum(rouge1_scores) / len(rouge1_scores)
average_rouge2 = sum(rouge2_scores) / len(rouge2_scores)
average_rougeL = sum(rougeL_scores) / len(rougeL_scores)

print(f"Average ROUGE-1 score: {average_rouge1:.2f}")
print(f"Average ROUGE-2 score: {average_rouge2:.2f}")
print(f"Average ROUGE-L score: {average_rougeL:.2f}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.07k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/986k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/873 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/557M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


BLEU score: 92.95
Exact Match score: 55.15%
chrF score: 95.17
Average ROUGE-1 score: 0.95
Average ROUGE-2 score: 0.91
Average ROUGE-L score: 0.95
