In [None]:
# install the necessary libraries

!pip install evaluate rouge_score bert_score sentence_transformers

In [4]:
# Step 2: Import the main library
import evaluate
import warnings
warnings.filterwarnings("ignore") # Hides a benign UserWarning from Sentence-BERT

# Step 3: Choose the candidate/reference pair to score
# The pair is picked to separate lexical overlap from semantic similarity:
# both sentences mean almost exactly the same thing, yet they share very few words.
reference_sentence = "a long-lasting coat for trekking"
candidate_sentence = "a durable jacket for hiking"

# Echo the inputs so the scores printed later can be read against them.
print(
    "--- Data for Evaluation ---",
    f"Candidate Generation: '{candidate_sentence}'",
    f"Reference Text:       '{reference_sentence}'",
    sep="\n",
)
print("-" * 29, "\n")

--- Data for Evaluation ---
Candidate Generation: 'a durable jacket for hiking'
Reference Text:       'a long-lasting coat for trekking'
----------------------------- 



In [5]:
# Step 4: Score the pair with ROUGE (a lexical-overlap metric)

print("--- Evaluating with ROUGE (Lexical Recall) ---")
rouge_metric = evaluate.load('rouge')

# One prediction, paired with its own list of acceptable references.
rouge_inputs = {
    "predictions": [candidate_sentence],
    "references": [[reference_sentence]],
}
results_rouge = rouge_metric.compute(**rouge_inputs)

# Report the ROUGE-L entry of the result, formatted to four decimals.
print(
    "ROUGE scores are low because there is very little word-for-word overlap.",
    f"ROUGE-L Score: {results_rouge['rougeL']:.4f}",
    sep="\n",
)
print("-" * 46, "\n")

# Step 5: Score the pair with BLEU (a lexical, n-gram precision metric)

print("--- Evaluating with BLEU (Lexical Precision) ---")
bleu_metric = evaluate.load('bleu')

# Same input layout as before: one prediction with a list of references for it.
bleu_inputs = {
    "predictions": [candidate_sentence],
    "references": [[reference_sentence]],
}
results_bleu = bleu_metric.compute(**bleu_inputs)

# Report the overall BLEU entry of the result, formatted to four decimals.
print(
    "BLEU score is also low, as it relies on matching n-grams (phrases).",
    f"BLEU Score: {results_bleu['bleu']:.4f}",
    sep="\n",
)
print("-" * 49, "\n")


# Step 6: Score the pair with BERTScore (a semantic metric)

print("--- Evaluating with BERTScore (Semantic Similarity) ---")
bertscore_metric = evaluate.load('bertscore')

# The model is named explicitly so that results stay consistent between runs.
bertscore_inputs = {
    "predictions": [candidate_sentence],
    "references": [[reference_sentence]],
    "model_type": "distilbert-base-uncased",
}
results_bert = bertscore_metric.compute(**bertscore_inputs)

# The result carries precision, recall and F1 (the harmonic mean of the first
# two) as lists; only one sentence pair was scored, so take element 0 of 'f1'.
bert_f1_score = results_bert['f1'][0]

print(
    "BERTScore understands that 'durable' is similar to 'long-lasting' and 'jacket' is similar to 'coat'.",
    f"BERTScore F1-Score: {bert_f1_score:.4f}",
    sep="\n",
)
print("-" * 55, "\n")

--- Evaluating with ROUGE (Lexical Recall) ---


Downloading builder script: 0.00B [00:00, ?B/s]

ROUGE scores are low because there is very little word-for-word overlap.
ROUGE-L Score: 0.3636
---------------------------------------------- 

--- Evaluating with BLEU (Lexical Precision) ---


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules: 0.00B [00:00, ?B/s]

BLEU score is also low, as it relies on matching n-grams (phrases).
BLEU Score: 0.0000
------------------------------------------------- 

--- Evaluating with BERTScore (Semantic Similarity) ---


Downloading builder script: 0.00B [00:00, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

BERTScore understands that 'durable' is similar to 'long-lasting' and 'jacket' is similar to 'coat'.
BERTScore F1-Score: 0.8906
------------------------------------------------------- 

