In [1]:
!pip install sacrebleu transformers torch

Collecting sacrebleu
  Downloading sacrebleu-2.4.3-py3-none-any.whl.metadata (51 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/51.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m51.2/51.8 kB[0m [31m3.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
Collecting portalocker (from sacrebleu)
  Downloading portalocker-2.10.1-py3-none-any.whl.metadata (8.5 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading sacrebleu-2.4.3-py3-none-any.whl (103 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.0/104.0 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Downloading portalocker-2.10.1-py3-none-any.whl (18 kB)
Installing collected packages: portalocker, colo

In [3]:
!pip install rouge-score

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=3608de780c45ae3e08f61c37899d6c8b3c41ac6310f6bcde58da988f54d9d65c
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [1]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import sacrebleu
from rouge_score import rouge_scorer
import torch
import json

# ---------------------------------------------------------------------------
# Evaluate a Hugging Face seq2seq checkpoint on a text -> code dataset and
# report BLEU, exact match, chrF and ROUGE-1/2/L.
# ---------------------------------------------------------------------------

# --- Configuration ---------------------------------------------------------
# JSON file that is iterated as a sequence of records with 'text' (model input)
# and 'code' (reference output) keys.
new_dataset_path = '/content/New_Evaluation_Dataset.json'
# Hub id of the checkpoint under evaluation (stored as TensorFlow weights,
# hence from_tf=True when loading below).
model_name = "AshArya/ITRLTrained"
# Optional cap on the number of tokens generated per example. None leaves the
# generation length to the checkpoint / library defaults, exactly as before.
# NOTE(review): if any reference is longer than that default length the
# prediction is cut short and every metric below is penalised - confirm the
# default is large enough for this dataset or set an explicit value here.
MAX_NEW_TOKENS = None


def _generate_predictions(model, tokenizer, texts, device, max_new_tokens=None):
    """Generate one decoded prediction string per input text.

    Args:
        model: seq2seq model exposing ``generate``; must already live on ``device``.
        tokenizer: tokenizer matching ``model``.
        texts: iterable of input strings.
        device: torch device the tokenized inputs are moved to.
        max_new_tokens: optional generation-length cap forwarded to
            ``model.generate``; ``None`` forwards nothing.

    Returns:
        List of decoded strings (special tokens removed), in the order of ``texts``.
    """
    generate_kwargs = {}
    if max_new_tokens is not None:
        generate_kwargs["max_new_tokens"] = max_new_tokens

    decoded = []
    for text in texts:
        # Examples are encoded one at a time, so every input is scored
        # independently of the others (no cross-example padding).
        encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True).to(device)

        # Inference only: skip gradient tracking.
        with torch.no_grad():
            generated = model.generate(**encoded, **generate_kwargs)

        # generate() returns a batch; there is a single sequence per call.
        decoded.append(tokenizer.decode(generated[0], skip_special_tokens=True))
    return decoded


# --- Load data -------------------------------------------------------------
# Explicit UTF-8 so decoding does not depend on the machine's locale.
with open(new_dataset_path, 'r', encoding='utf-8') as file:
    dataset = json.load(file)

# Fail fast, before the model is downloaded: with no examples the averages
# below would otherwise die with an opaque ZeroDivisionError.
if not dataset:
    raise ValueError(f"Evaluation dataset at {new_dataset_path} is empty; nothing to score.")

# Separate the dataset into model inputs and reference outputs.
inputs = [item['text'] for item in dataset]
references = [item['code'] for item in dataset]

# --- Load model ------------------------------------------------------------
model = AutoModelForSeq2SeqLM.from_pretrained(model_name, from_tf=True)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Use the GPU when one is available; otherwise stay on CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# --- Generate predictions --------------------------------------------------
predictions = _generate_predictions(model, tokenizer, inputs, device, MAX_NEW_TOKENS)

# --- Corpus-level metrics --------------------------------------------------
# sacrebleu takes a list of reference streams; there is exactly one here.
bleu = sacrebleu.corpus_bleu(predictions, [references])
print(f"BLEU score: {bleu.score}")

# Exact match: percentage of predictions equal to their reference once
# leading/trailing whitespace is ignored.
exact_matches = sum(pred.strip() == ref.strip() for pred, ref in zip(predictions, references))
exact_match_score = exact_matches / len(references) * 100
print(f"Exact Match score: {exact_match_score:.2f}%")

chrf = sacrebleu.corpus_chrf(predictions, [references])
print(f"chrF score: {chrf.score:.2f}")

# --- ROUGE (per example, then averaged) ------------------------------------
# NOTE(review): use_stemmer=True stems tokens before matching; for code
# outputs this may count different identifiers as equal - confirm it is intended.
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
rouge1_scores = []
rouge2_scores = []
rougeL_scores = []

for pred, ref in zip(predictions, references):
    # RougeScorer.score is called as (reference, prediction).
    scores = scorer.score(ref, pred)
    rouge1_scores.append(scores['rouge1'].fmeasure)
    rouge2_scores.append(scores['rouge2'].fmeasure)
    rougeL_scores.append(scores['rougeL'].fmeasure)

# Mean F-measure across examples; the lists are non-empty because an empty
# dataset was rejected above.
average_rouge1 = sum(rouge1_scores) / len(rouge1_scores)
average_rouge2 = sum(rouge2_scores) / len(rouge2_scores)
average_rougeL = sum(rougeL_scores) / len(rougeL_scores)

print(f"Average ROUGE-1 score: {average_rouge1:.2f}")
print(f"Average ROUGE-2 score: {average_rouge2:.2f}")
print(f"Average ROUGE-L score: {average_rougeL:.2f}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
All TF 2.0 model weights were used when initializing T5ForConditionalGeneration.

Some weights of T5ForConditionalGeneration were not initialized from the TF 2.0 model and are newly initialized: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BLEU score: 87.64200982025153
Exact Match score: 81.62%
chrF score: 94.55
Average ROUGE-1 score: 0.93
Average ROUGE-2 score: 0.88
Average ROUGE-L score: 0.93
