In [None]:
!pip install evaluate rouge_score bert_score

Collecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bert_score
Successfully installed bert_score-0.3.13


In [None]:
# ==============================================================================
# E2E SCRIPT FOR LIL COURSE: EVALUATING GENERATED TEXT
# Video 3.2: ROUGE, BLEU, and BERTScore
# ==============================================================================

# Step 1: Install the necessary libraries
# In a real Codespace/Notebook environment, you would run this cell first.
# !pip install evaluate rouge_score bert_score

# --- Spoken Script Start ---
# "Okay, let's see this in action. Here I have a GitHub Codespace, and the first
# thing I've done is install the key libraries we need: evaluate, rouge_score,
# and bert_score."

# Step 2: Import the main library
import evaluate
import warnings
warnings.filterwarnings("ignore") # Hides a benign UserWarning from Sentence-BERT

# "Next, I'll import the evaluate library from Hugging Face, which is our main tool."

# Step 3: Set up the sentence pair that every metric below will score
# The pair is chosen to contrast lexical overlap with semantic similarity.
candidate_sentence, reference_sentence = (
    "a durable jacket for hiking",
    "a long-lasting coat for trekking",
)

# "Now, I've defined my candidate generation and a human-written reference.
# Notice that they mean almost the exact same thing, but use completely different words."

# Emit the whole banner with a single print call. The divider keeps its
# trailing space and the blank line after it, so the output is unchanged.
print(
    "--- Data for Evaluation ---",
    f"Candidate Generation: '{candidate_sentence}'",
    f"Reference Text:       '{reference_sentence}'",
    "-" * 29 + " ",
    "",
    sep="\n",
)


# Step 4: Score the pair with ROUGE (lexical, recall-oriented)
# "First, let's evaluate this using a classic lexical metric: ROUGE. We'll specifically
# look at ROUGE-L, which measures the Longest Common Subsequence."

print("--- Evaluating with ROUGE (Lexical Recall) ---")
# Fetch the ROUGE implementation through the evaluate library
rouge_metric = evaluate.load('rouge')

# One prediction, paired with a list that holds its single reference
results_rouge = rouge_metric.compute(
    references=[[reference_sentence]],
    predictions=[candidate_sentence],
)

# Report ROUGE-L only. The value under 'rougeL' is the F-measure of the
# longest-common-subsequence precision and recall.
print(
    "ROUGE scores are low because there is very little word-for-word overlap.",
    f"ROUGE-L Score: {results_rouge['rougeL']:.4f}",
    "-" * 46 + " ",
    "",
    sep="\n",
)

# "As you can see, the ROUGE-L score is low—only about 0.36. The metric sees
# that besides the words 'a' and 'for', there is no direct overlap, so it
# rates the generation poorly, even though its meaning is correct."


# Step 5: Score the pair with BLEU (lexical, precision-oriented)
# "Now let's try BLEU, which focuses on precision. It also looks for n-gram overlap
# and will likely give us a similarly low score."

print("--- Evaluating with BLEU (Lexical Precision) ---")
# Fetch the BLEU implementation through the evaluate library
bleu_metric = evaluate.load('bleu')

# Same call shape as ROUGE: one prediction and a list with its one reference
results_bleu = bleu_metric.compute(
    references=[[reference_sentence]],
    predictions=[candidate_sentence],
)

# Show the corpus-level BLEU value followed by the section divider
print("BLEU score is also low, as it relies on matching n-grams (phrases).")
print(f"BLEU Score: {results_bleu['bleu']:.4f}")
print(f"{'-' * 49} \n")


# Step 6: Score the pair with BERTScore (semantic)
# "Finally, let's load and compute BERTScore. This metric doesn't care about
# exact words; it uses contextual embeddings to see if the *meaning* is the same."

print("--- Evaluating with BERTScore (Semantic Similarity) ---")
# Fetch the BERTScore implementation through the evaluate library
bertscore_metric = evaluate.load('bertscore')

# Name the embedding model explicitly so the score does not depend on
# whichever model the library would pick by default.
results_bert = bertscore_metric.compute(
    model_type="distilbert-base-uncased",
    references=[[reference_sentence]],
    predictions=[candidate_sentence],
)

# The result holds per-pair lists for precision, recall and F1. Only one pair
# was scored, so the first F1 entry is the number to report.
bert_f1_score = results_bert['f1'][0]

print(
    "BERTScore understands that 'durable' is similar to 'long-lasting' and 'jacket' is similar to 'coat'.",
    f"BERTScore F1-Score: {bert_f1_score:.4f}",
    "-" * 55 + " ",
    "",
    sep="\n",
)

# "And look at that difference. The F1-score is very high—close to 0.9.
# This is because BERTScore correctly identifies the strong semantic overlap between
# 'durable jacket' and 'long-lasting coat', as well as 'hiking' and 'trekking'.
# It successfully captures the quality of our generated text where the lexical metrics failed."

# --- Spoken Script End ---

--- Data for Evaluation ---
Candidate Generation: 'a durable jacket for hiking'
Reference Text:       'a long-lasting coat for trekking'
----------------------------- 

--- Evaluating with ROUGE (Lexical Recall) ---
ROUGE scores are low because there is very little word-for-word overlap.
ROUGE-L Score: 0.3636
---------------------------------------------- 

--- Evaluating with BLEU (Lexical Precision) ---
BLEU score is also low, as it relies on matching n-grams (phrases).
BLEU Score: 0.0000
------------------------------------------------- 

--- Evaluating with BERTScore (Semantic Similarity) ---


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

BERTScore understands that 'durable' is similar to 'long-lasting' and 'jacket' is similar to 'coat'.
BERTScore F1-Score: 0.8906
------------------------------------------------------- 

