In [1]:
# In this notebook, you learn:
#
# 1) How to measure the quality of a machine translation model?
# 2) How to calculate BLEU score?

In [None]:
# Resources to understand BLEU score:
#
# 1) https://youtu.be/DejHQYAGb7Q?si=no3A70rxzxnFywXd
#       -- Video by Andrew NG explaining BLEU score and how to calculate it.
# 2) https://blog.modernmt.com/understanding-mt-quality-bleu-scores/
#       -- Blog post explaining the advantages and disadvantages of BLEU score.
# 3) https://towardsdatascience.com/foundations-of-nlp-explained-bleu-score-and-wer-metrics-1a5ba06d812b
#       -- Blog post explaining the math behind BLEU score clearly.
# 4) https://docs.google.com/document/d/1OPldZW_9NbG8JLywnqJ91yJV9olP9npG8FJG9csnpwc/edit?tab=t.0#bookmark=id.cbldv5yohjf9
#       -- Google document explaining why BLEU score is always in the range [0, 1].
#       -- This is a conversation between me and Gemini.
# 5) https://docs.google.com/document/d/1OPldZW_9NbG8JLywnqJ91yJV9olP9npG8FJG9csnpwc/edit?tab=t.0#bookmark=id.2uvpt29pxts9
#       -- Google document explaining how BLEU score is calculated for a corpus of translations as opposed to a single translation.
# 6) https://docs.google.com/document/d/1OPldZW_9NbG8JLywnqJ91yJV9olP9npG8FJG9csnpwc/edit?tab=t.0#bookmark=id.nfzxm12zp3bu
#       -- Google document showing a running example of how BLEU score is calculated for a corpus.

### Lets see how to use 'sacrebleu' package to calculate the BELU score.

In [None]:
import evaluate

In [3]:
sacrebleu = evaluate.load("sacrebleu")

Downloading builder script: 100%|██████████| 8.15k/8.15k [00:00<00:00, 16.6MB/s]


In [4]:
sacrebleu

EvaluationModule(name: "sacrebleu", module_type: "metric", features: [{'predictions': Value(dtype='string', id='sequence'), 'references': Sequence(feature=Value(dtype='string', id='sequence'), length=-1, id='references')}, {'predictions': Value(dtype='string', id='sequence'), 'references': Value(dtype='string', id='sequence')}], usage: """
Produces BLEU scores along with its sufficient statistics
from a source against one or more references.

Args:
    predictions (`list` of `str`): list of translations to score. Each translation should be tokenized into a list of tokens.
    references (`list` of `list` of `str`): A list of lists of references. The contents of the first sub-list are the references for the first prediction, the contents of the second sub-list are for the second prediction, etc. Note that there must be the same number of references for each prediction (i.e. all sub-lists must be of the same length).
    smooth_method (`str`): The smoothing method to use, defaults to `'e

In [5]:
predictions_1 = ["Hello My Name is Kabil"]
references_1 = [
    ["I am Kabil", "My Name is Kabil"]
]

In [6]:
# Lets go through the output returned by sacrebleu and understand what each term means.
# Please note that though these explanations work here for this example, they might not be accurate for all cases
# because the tokenization used internally by sacrebleu might be different and the words might be tokenized 
# differently which might affect the n-grams produced.
# 
# score (66.874): This is the BLEU score, given as a percentage. The actual BLEU score is this number divided by 100 which is 0.66 in this case.
# counts ([4, 3, 2, 1]): This gives the number of n-grams matched in the candidate and reference translations.
#       -- counts[0] = 4: Number of matching uni-grams i.e., single words.
#               -- My, Name, is, Kabil
#       -- counts[1] = 3: Number of matching bi-grams i.e., two words.
#               -- My Name, Name is, is Kabil
#       -- counts[2] = 2: Number of matching tri-grams i.e., three words.
#               -- My Name is, Name is Kabil
#       -- counts[3] = 1: Number of matching 4-grams i.e., four words.
#               -- My Name is Kabil
# totals ([4, 3, 2, 1]): This gives the total number of n-grams in the candidate translations.
#       -- totals[0] = 5: Number of uni-grams in the candidate translation.
#               -- Hello, My, Name, is, Kabil
#       -- totals[1] = 4: Number of bi-grams in the candidate translation.
#               -- Hello My, My Name, Name is, is Kabil
#       -- totals[2] = 3: Number of tri-grams in the candidate translation.
#               -- Hello My Name, My Name is, Name is Kabil
#       -- totals[3] = 2: Number of 4-grams in the candidate translation.
#               -- Hello My Name is, My Name is Kabil
# precisions ([0.8, 0.75, 0.6666666666666666, 0.5]): This gives the precision for each n-gram size (Not each individual n-gram).
#       -- precision for each n-gram size is calculated as follows: counts[n] / totals[n]. This is multiplied by 100 to get the percentage.
#       -- precisions[0] = counts[0] / totals[0] = 4/5 = 0.8 
#       -- precisions[1] = counts[1] / totals[1] = 3/4 = 0.75
#       -- precisions[2] = counts[2] / totals[2] = 2/3 = 0.6666666666666666
#       -- precisions[3] = counts[3] / totals[3] = 1/2 = 0.5
# sys_len (5): This is the length of the candidate translation.
# ref_len (4): This is the length of the reference translations.
#       -- I DIDN'T UNDERSTAND WHY THIS IS 4. MY GUESS FOR NOW IS THAT THIS IS BECAUSE OF THE TOKENIZATION BEING USED INTERNALLY
#          BY SACREBLEU.
# bp (1.0): This is the brevity penalty. This is 1.0 because the candidate translation is longer than the reference translations i.e.,
#           sys_len > ref_len.
bleu_1 = sacrebleu.compute(predictions=predictions_1, references=references_1)
print(type(bleu_1))
print(bleu_1)

<class 'dict'>
{'score': 66.87403049764218, 'counts': [4, 3, 2, 1], 'totals': [5, 4, 3, 2], 'precisions': [80.0, 75.0, 66.66666666666667, 50.0], 'bp': 1.0, 'sys_len': 5, 'ref_len': 4}


In [7]:
predictions_2 = ["Hey, that is the great Sachin Tendulkar", "It is not easy to learn machine learning from scratch in a short time"]
references_2 = [
    ["That is sachin tendulkar", "Great Sachin Tendulkar is here"],
    ["Not easy to learn machine learning from scratch in a short time", "It is hard to learn machine learning in a short time"]
]

In [8]:
bleu_2 = sacrebleu.compute(predictions=predictions_2, references=references_2)
print(bleu_2)

{'score': 57.47078645171894, 'counts': [16, 12, 9, 8], 'totals': [22, 20, 18, 16], 'precisions': [72.72727272727273, 60.0, 50.0, 50.0], 'bp': 1.0, 'sys_len': 22, 'ref_len': 17}


### Now, Lets see how to use 'nltk' library to calculate the BELU score.

In [12]:
from nltk.translate.bleu_score import corpus_bleu

In [None]:
# Notice that the BLEU scores calculated using nltk are different from the BLEU scores calculated using sacrebleu.
# This could be because of the different tokenization used by sacrebleu and nltk or because of the different way in which
# the brevity penalty is calculated. 
# 
# Calculate BLEU score for the first set of predictions and references
bleu_1_nltk = corpus_bleu(references_1, predictions_1)
print(f"BLEU score for predictions_1 using nltk: {bleu_1_nltk}")

# Calculate BLEU score for the second set of predictions and references
bleu_2_nltk = corpus_bleu(references_2, predictions_2)
print(f"BLEU score for predictions_2 using nltk: {bleu_2_nltk}")

BLEU score for predictions_1 using nltk: 0.7062594378058554
BLEU score for predictions_2 using nltk: 0.8447728290556981
