Installing Libraries for Automated Metrics

In [None]:
!pip install sacrebleu

In [None]:
!pip install rouge-score

In [None]:
# Install BLEURT in Google Colab

# Upgrade pip (optional, but recommended)
!pip install --upgrade pip

# Clone the BLEURT repository from GitHub
!git clone https://github.com/google-research/bleurt.git

# Change directory to the BLEURT folder
%cd bleurt

# Install BLEURT using pip
!pip install .


In [None]:
!pip install sacrebleu rouge-score nltk bert-score torch
!git clone https://github.com/neulab/BARTScore.git
%cd BARTScore
!pip install -r requirements.txt




In [None]:
!wget https://storage.googleapis.com/bleurt-oss-21/BLEURT-20.zip
!unzip BLEURT-20.zip -d ./BLEURT-20/

In [None]:
import pandas as pd
import sacrebleu
from rouge_score import rouge_scorer
from bleurt import score
from nltk.translate.meteor_score import meteor_score
from bert_score import score as bert_score

# Import BARTScore from the local clone
from bart_score import BARTScorer

# Download the NLTK data the evaluation below relies on:
#   - 'wordnet' is used by METEOR (meteor_score);
#   - 'punkt' / 'punkt_tab' are the tokenizer models behind nltk.word_tokenize.
#     Recent NLTK releases look up 'punkt_tab' instead of 'punkt'; both are
#     requested so the cell works regardless of the installed NLTK version.
#   - 'omw-1.4' is required alongside WordNet by some NLTK releases
#     (NOTE(review): harmless if unused - confirm against the installed version).
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')
nltk.download('punkt_tab')

import warnings
from transformers import logging

# Only show errors from transformers (hides its informational/warning messages)
logging.set_verbosity_error()

# ---------------------------------------------------------------------------
# Automated evaluation: scores every generated text against its reference with
# BLEU, ROUGE-L, BLEURT, METEOR, BERTScore and BARTScore, then writes the input
# table plus one column per metric to a CSV file.
# ---------------------------------------------------------------------------

# Load your CSV file
df = pd.read_csv('/content/dataset_humaneval.csv', encoding='ISO-8859-1')
# Alternative input files:
# df = pd.read_csv('/content/generated_explanations_gpt-4o-2024-05-13_eval.csv', encoding='ISO-8859-1')
# df = pd.read_csv('/content/explanations_generation_evaluate.csv', encoding='ISO-8859-1')

# Columns in your CSV
generated_col = 'corrected_tweet'  # Replace with your actual column name
# generated_col = 'Generated Explanation'  # alternative generated-text column
reference_col = 'V1'  # Replace with your actual column name

# Fail early, with a readable message, if a configured column is absent.
missing_cols = [col for col in (generated_col, reference_col) if col not in df.columns]
if missing_cols:
    raise KeyError(
        f"Column(s) {missing_cols} not found in the CSV; available columns: {list(df.columns)}"
    )

# pandas reads empty cells as NaN (a float), which the tokenizer and the
# scorers cannot process. Coerce every cell to a string so such rows are scored
# as empty text instead of aborting the whole run.
generated_texts = df[generated_col].fillna('').astype(str).tolist()
reference_texts = df[reference_col].fillna('').astype(str).tolist()

# Assumed lower bound of the raw BARTScore, used to map it onto roughly [0, 1].
# Values outside [-BART_SCORE_FLOOR, 0] are not clipped.
BART_SCORE_FLOOR = 10.0

# Initialize BLEURT scorer
bleurt_checkpoint = "./BLEURT-20/BLEURT-20/"  # Ensure this path points to the folder with the BLEURT files
scorer = score.BleurtScorer(bleurt_checkpoint)

# Initialize BARTScorer with CPU
bart_scorer = BARTScorer(device='cpu', checkpoint='facebook/bart-large-cnn')

# The ROUGE scorer is stateless across calls, so build it once rather than
# once per row.
rouge_score = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# Lists to store results
bleu_scores = []
rouge_scores = []
bleurt_scores = []
meteor_scores = []
bert_scores = []
bart_scores = []

# Score each (generated, reference) pair
total_rows = len(generated_texts)
for position, (generated_text, reference_text) in enumerate(
    zip(generated_texts, reference_texts), start=1
):
    # Tokenize the texts for METEOR score
    generated_tokens = nltk.word_tokenize(generated_text)
    reference_tokens = nltk.word_tokenize(reference_text)

    # BLEU score (normalized by dividing by 100)
    # NOTE(review): corpus_bleu is applied to a single sentence here; sacrebleu
    # also offers sentence_bleu for segment-level scoring. Switching would
    # change the reported values, so it is left as is - confirm the intent.
    bleu = sacrebleu.corpus_bleu([generated_text], [[reference_text]])
    bleu_scores.append(bleu.score / 100.0)

    # ROUGE-L F1 (already between 0 and 1)
    rouge_scores.append(rouge_score.score(reference_text, generated_text)['rougeL'].fmeasure)

    # BLEURT score (stored as returned, without further normalization)
    bleurt_score = scorer.score(references=[reference_text], candidates=[generated_text])
    bleurt_scores.append(bleurt_score[0])

    # METEOR score (between 0 and 1)
    meteor = meteor_score([reference_tokens], generated_tokens)
    meteor_scores.append(meteor)

    # BARTScore is an average log-likelihood (<= 0): shift and scale it using
    # the assumed range [-BART_SCORE_FLOOR, 0].
    bart = bart_scorer.score([generated_text], [reference_text], batch_size=4)
    bart_normalized = (bart[0] + BART_SCORE_FLOOR) / BART_SCORE_FLOOR
    bart_scores.append(bart_normalized)

    # Notify that the evaluation for the current sentence is complete.
    # The counter is the row position, so it stays correct even when the
    # dataframe index is not 0..n-1.
    print(f"Completed evaluation for sentence {position}/{total_rows}")

# BERTScore F1 for all pairs in a single batched call. bert_score() sets up its
# model on every call, so calling it once per row repeats that cost for each
# sentence; one call over the full lists yields one F1 value per pair.
if generated_texts:
    P, R, F1 = bert_score(generated_texts, reference_texts, lang='en', rescale_with_baseline=True)
    bert_scores = F1.tolist()

# Add the normalized scores back to the dataframe
df['BLEU'] = bleu_scores
df['ROUGE'] = rouge_scores  # ROUGE-L F1 score
df['BLEURT'] = bleurt_scores
df['METEOR'] = meteor_scores
df['BERTScore'] = bert_scores
df['BARTScore'] = bart_scores

import shutil

# Save results to a new CSV file
file_path = '/content/evaluation_automated_explica.csv'
df.to_csv(file_path, index=False)

# Notify that the file is ready for download (report the path actually written)
print(f"Evaluation completed. Results saved to '{file_path}'. The file is ready for download.")
