Common machine-translation evaluation metrics are demonstrated using simple, controlled examples.
The goal is to build intuition for how CER, WER, and BLEU behave under different prediction errors.

In [2]:
!pip install -q torchmetrics

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/983.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m378.9/983.2 kB[0m [31m11.2 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m983.2/983.2 kB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [5]:
from torchmetrics.text import CharErrorRate, WordErrorRate, SacreBLEUScore
from torchmetrics.text import SacreBLEUScore

In [8]:
# Sanity check: the hypothesis is the very same sentence as the reference,
# so both error rates should bottom out and BLEU should top out.
expected = ["this is a test"]
predicted = ["this is a test"]

# SacreBLEU takes a list of references for every prediction.
references = [[sentence] for sentence in expected]

char_error_rate = CharErrorRate()
word_error_rate = WordErrorRate()
sacre_bleu = SacreBLEUScore(tokenize="13a", smooth=True)

cer = char_error_rate(predicted, expected)
wer = word_error_rate(predicted, expected)
bleu = sacre_bleu(predicted, references)

# Report each score as a plain Python float.
for label, score in (("CER:", cer), ("WER:", wer), ("BLEU:", bleu)):
    print(label, score.item())

CER: 0.0
WER: 0.0
BLEU: 1.0


In [10]:
# Single substitution: the last word of the hypothesis differs from the
# reference ("demo" instead of "test"); everything else is identical.
expected = ["this is a test"]
predicted = ["this is a demo"]

# SacreBLEU takes a list of references for every prediction.
references = [[sentence] for sentence in expected]

char_error_rate = CharErrorRate()
word_error_rate = WordErrorRate()
sacre_bleu = SacreBLEUScore(tokenize="13a", smooth=True)

cer = char_error_rate(predicted, expected)
wer = word_error_rate(predicted, expected)
bleu = sacre_bleu(predicted, references)

# Report each score as a plain Python float.
for label, score in (("CER:", cer), ("WER:", wer), ("BLEU:", bleu)):
    print(label, score.item())

CER: 0.2142857164144516
WER: 0.25
BLEU: 0.0


In [12]:
# Reordering: the hypothesis uses exactly the reference's words, but the
# first two are swapped.
expected = ["this is a test"]
predicted = ["is this a test"]

# SacreBLEU takes a list of references for every prediction.
references = [[sentence] for sentence in expected]

char_error_rate = CharErrorRate()
word_error_rate = WordErrorRate()
sacre_bleu = SacreBLEUScore(tokenize="13a", smooth=True)

cer = char_error_rate(predicted, expected)
wer = word_error_rate(predicted, expected)
bleu = sacre_bleu(predicted, references)

# Report each score as a plain Python float.
for label, score in (("CER:", cer), ("WER:", wer), ("BLEU:", bleu)):
    print(label, score.item())

CER: 0.2857142984867096
WER: 0.5
BLEU: 0.0


In [13]:
# Mostly wrong prediction: only the leading word ("this") lines up with the
# reference; the rest of the hypothesis is different and longer.
expected = ["this is a test"]
predicted = ["this example is incorrect"]

# SacreBLEU takes a list of references for every prediction.
references = [[sentence] for sentence in expected]

char_error_rate = CharErrorRate()
word_error_rate = WordErrorRate()
sacre_bleu = SacreBLEUScore(tokenize="13a", smooth=True)

cer = char_error_rate(predicted, expected)
wer = word_error_rate(predicted, expected)
bleu = sacre_bleu(predicted, references)

# Report each score as a plain Python float.
for label, score in (("CER:", cer), ("WER:", wer), ("BLEU:", bleu)):
    print(label, score.item())

CER: 1.0714285373687744
WER: 0.75
BLEU: 0.0


In [14]:
# Corpus-level behavior: two sentence pairs scored together. The first pair
# matches exactly; the second differs only in its final word.
expected = ["this is a test", "machine translation is difficult"]
predicted = ["this is a test", "machine translation is hard"]

# SacreBLEU takes a list of references for every prediction, so wrap each
# reference sentence in its own single-element list.
references = [[sentence] for sentence in expected]

char_error_rate = CharErrorRate()
word_error_rate = WordErrorRate()
sacre_bleu = SacreBLEUScore(tokenize="13a", smooth=True)

cer = char_error_rate(predicted, expected)
wer = word_error_rate(predicted, expected)
bleu = sacre_bleu(predicted, references)

# Report each score as a plain Python float.
for label, score in (("CER:", cer), ("WER:", wer), ("BLEU:", bleu)):
    print(label, score.item())

CER: 0.19565217196941376
WER: 0.125
BLEU: 0.7952707409858704


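CER measures character-level edit distance, divided by the number of characters in the reference. Because of this normalization it can exceed 1.0 when the prediction is much longer than the reference, as in the 1.07 result above.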

WER counts word-level substitutions, insertions, and deletions, divided by the number of words in the reference.

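BLEU measures n-gram overlap and is highly sensitive to phrasing and word order, especially on small samples, reaching its maximum value only when predictions exactly match the reference. This is why the imperfect single-sentence examples above score 0.0 even with `smooth=True`: in each of them at least one n-gram order has no match with the reference. The two-sentence corpus, which contains one exact match, keeps a non-zero score.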