# BLEU Score for Unigrams

In [1]:
# German source sentence. It is kept for context only: none of the visible
# scoring cells reads `original`.
original = "Der schnelle braune Fuchs sprang ueber den faulen Hund"

# One English reference translation followed by three candidate translations.
# The extra spaces in candidate_1 only line its words up under the reference;
# whitespace tokenization with str.split() collapses them.
# candidate_3 ends with a period, which stays attached to its last token
# ("canine.") when the string is split on whitespace.
reference =   "The quick brown fox jumped over the lazy dog"
candidate_1 = "The fast  brown fox leaped over the      dog"
candidate_2 = "The swift brown fox jumped over the lazy dog"
candidate_3 = "The swift tawny fox leaped over the indolent canine."

## NLTK

In [2]:
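# Third-party dependencies used below; uncomment to install them into this kernel's environment.
# %pip install nltk torchmetrics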

In [3]:
from nltk.translate.bleu_score import sentence_bleu

# Score every candidate against the single whitespace-tokenized reference.
# weights=[1.] restricts the score to the 1-gram term.
bleu_nltk_1, bleu_nltk_2, bleu_nltk_3 = (
    sentence_bleu([reference.split()], candidate.split(), weights=[1.])
    for candidate in (candidate_1, candidate_2, candidate_3)
)

print(f"BLEU score for example 1: {bleu_nltk_1:.2f}")
print(f"BLEU score for example 2: {bleu_nltk_2:.2f}")
print(f"BLEU score for example 3: {bleu_nltk_3:.2f}")

BLEU score for example 1: 0.66
BLEU score for example 2: 0.89
BLEU score for example 3: 0.44


## TorchMetrics

In [5]:
from torchmetrics import BLEUScore

bleu = BLEUScore(n_gram=1)

# Calculate BLEU scores: one prediction per call, each paired with a
# one-item list holding the raw (untokenized) reference string.
bleu_tm_1, bleu_tm_2, bleu_tm_3 = (
    bleu(target=[[reference]], preds=[candidate])
    for candidate in (candidate_1, candidate_2, candidate_3)
)

print(f"BLEU score for example 1: {bleu_tm_1:.2f}")
print(f"BLEU score for example 2: {bleu_tm_2:.2f}")
print(f"BLEU score for example 3: {bleu_tm_3:.2f}")

BLEU score for example 1: 0.66
BLEU score for example 2: 0.89
BLEU score for example 3: 0.44


## From Scratch

In [6]:
import math
from collections import Counter

def ngrams(sentence, n):
    """Return every contiguous run of ``n`` items from ``sentence``.

    Args:
        sentence: Sliceable sequence of tokens.
        n: Number of tokens per n-gram.

    Returns:
        A list of tuples in left-to-right order; empty when ``sentence``
        holds fewer than ``n`` tokens.
    """
    window_count = len(sentence) - n + 1
    grams = []
    for start in range(window_count):
        grams.append(tuple(sentence[start:start + n]))
    return grams

def modified_precision(reference, candidate, n):
    """Compute the clipped n-gram precision of ``candidate`` against one reference.

    Args:
        reference: Token sequence of the reference.
        candidate: Token sequence of the candidate.
        n: N-gram order.

    Returns:
        Clipped matches divided by the number of candidate n-grams, or 0
        when the candidate has no n-grams of this order.
    """
    reference_counts = Counter(ngrams(reference, n))
    candidate_counts = Counter(ngrams(candidate, n))

    total = sum(candidate_counts.values())
    if total <= 0:
        return 0

    # Each candidate n-gram is credited at most as often as the reference contains it.
    clipped = 0
    for gram, seen in candidate_counts.items():
        clipped += min(seen, reference_counts[gram])
    return clipped / total

def brevity_penalty(reference, candidate):
    """Return the brevity penalty for a candidate scored against one reference.

    Args:
        reference: Token sequence of the reference.
        candidate: Token sequence of the candidate.

    Returns:
        1 when the candidate is longer than the reference, 0 when the
        candidate is empty, otherwise ``exp(1 - len(reference) / len(candidate))``.
    """
    n_ref, n_cand = len(reference), len(candidate)

    if n_cand > n_ref:
        return 1
    if n_cand == 0:
        return 0
    return math.exp(1 - n_ref / n_cand)

def bleu_score_unigram(reference, candidate):
    """Return the unigram BLEU score of ``candidate`` against one reference.

    Args:
        reference: Token sequence of the reference.
        candidate: Token sequence of the candidate.

    Returns:
        The brevity penalty multiplied by the clipped unigram precision.
    """
    unigram_precision = modified_precision(reference, candidate, n=1)
    penalty = brevity_penalty(reference, candidate)
    return penalty * unigram_precision


# Score every candidate against the whitespace-tokenized reference.
bleu_scratch_1, bleu_scratch_2, bleu_scratch_3 = (
    bleu_score_unigram(reference=reference.split(), candidate=candidate.split())
    for candidate in (candidate_1, candidate_2, candidate_3)
)

print(f"BLEU score for example 1: {bleu_scratch_1:.2f}")
print(f"BLEU score for example 2: {bleu_scratch_2:.2f}")
print(f"BLEU score for example 3: {bleu_scratch_3:.2f}")

BLEU score for example 1: 0.66
BLEU score for example 2: 0.89
BLEU score for example 3: 0.44


# BLEU Score for 4-grams ("default" BLEU)

In [7]:
# NOTE: candidate_1 and candidate_2 are rebound here, replacing the sentences
# assigned in the unigram section. Re-running the unigram cells after this
# cell would therefore score these sentences against the old `reference`.

# Example 1: one candidate with two reference translations.
candidate_1 = "The quick brown dog jumps over the lazy fox"
references_1 = [
    "The quick brown fox jumps over the lazy dog",
    "The fast brown fox leaps over the lazy dog",
]

# Example 2: one candidate with two reference translations.
candidate_2 = "The small red car drives quickly down the road"
references_2 = [
    "The small red car races quickly along the road",
    "A small red car speeds rapidly down the avenue",
]

## NLTK

In [8]:
from nltk.translate.bleu_score import sentence_bleu

# Each candidate is scored against its own list of whitespace-tokenized
# references, using sentence_bleu's default weights.
bleu_nltk_1, bleu_nltk_2 = (
    sentence_bleu([ref.split() for ref in refs], cand.split())
    for cand, refs in ((candidate_1, references_1), (candidate_2, references_2))
)

print(f"BLEU score for example 1: {bleu_nltk_1:.2f}")
print(f"BLEU score for example 2: {bleu_nltk_2:.2f}")

BLEU score for example 1: 0.46
BLEU score for example 2: 0.40


## TorchMetrics

In [9]:
from torchmetrics import BLEUScore

bleu = BLEUScore(n_gram=4)

# Calculate BLEU scores: one prediction per call, paired with its own list
# of raw reference strings.
bleu_tm_1, bleu_tm_2 = (
    bleu(target=[refs], preds=[cand])
    for cand, refs in ((candidate_1, references_1), (candidate_2, references_2))
)

print(f"BLEU score for example 1: {bleu_tm_1:.2f}")
print(f"BLEU score for example 2: {bleu_tm_2:.2f}")

BLEU score for example 1: 0.46
BLEU score for example 2: 0.40


## From Scratch

In [10]:
import math
from collections import Counter
from fractions import Fraction

def tokenize(sentence):
    """Lower-case ``sentence`` and split it on runs of whitespace.

    Args:
        sentence: Raw sentence string.

    Returns:
        A list of lower-cased tokens; punctuation stays attached to its word.
    """
    lowered = sentence.lower()
    return lowered.split()

def ngrams(tokens, n):
    """Return every contiguous run of ``n`` items from ``tokens``.

    Args:
        tokens: Sliceable sequence of tokens.
        n: Number of tokens per n-gram.

    Returns:
        A list of tuples in left-to-right order; empty when ``tokens``
        holds fewer than ``n`` items.
    """
    def window_at(start):
        return tuple(tokens[start:start + n])

    return list(map(window_at, range(len(tokens) - n + 1)))

def modified_precision(candidate, references, n):
    """Compute the clipped n-gram precision of ``candidate`` against several references.

    Args:
        candidate: Token sequence of the candidate.
        references: Iterable of reference token sequences.
        n: N-gram order.

    Returns:
        A ``Fraction`` of clipped matches over candidate n-grams, or the
        integer 0 when the candidate has no n-grams of this order.
    """
    cand_counts = Counter(ngrams(candidate, n))
    counts_per_reference = [Counter(ngrams(ref, n)) for ref in references]

    total = sum(cand_counts.values())
    if total == 0:
        return 0

    clipped_total = 0
    for gram, cand_count in cand_counts.items():
        # Credit is capped by the largest count found in any single reference.
        best_ref_count = max((counts[gram] for counts in counts_per_reference), default=0)
        clipped_total += min(cand_count, best_ref_count)

    return Fraction(clipped_total, total)

def closest_reference_length(candidate, references):
    """Return the reference length closest to the candidate's length.

    Args:
        candidate: Token sequence of the candidate.
        references: Non-empty collection of reference token sequences.

    Returns:
        The length of the reference nearest in length to ``candidate``;
        when two lengths are equally near, the shorter one is returned.

    Raises:
        ValueError: If ``references`` is empty.
    """
    target = len(candidate)

    def distance_then_length(length):
        # Sort primarily by distance to the candidate length, then by length itself.
        return abs(length - target), length

    return min([len(ref) for ref in references], key=distance_then_length)

def brevity_penalty(candidate, references):
    """Return the BLEU brevity penalty for a tokenized candidate.

    Args:
        candidate: Token sequence of the candidate.
        references: Collection of reference token sequences; the one whose
            length is closest to the candidate's sets the target length.

    Returns:
        0 for an empty candidate, 1 when the candidate is longer than the
        closest reference, otherwise ``exp(1 - ref_len / candidate_len)``.
    """
    candidate_length = len(candidate)

    # An empty candidate has nothing to score. Returning 0 here avoids the
    # division by zero below and matches the single-reference brevity_penalty
    # defined earlier in the file.
    if candidate_length == 0:
        return 0

    closest_ref_len = closest_reference_length(candidate, references)

    if candidate_length > closest_ref_len:
        return 1
    return math.exp(1 - closest_ref_len / candidate_length)

def sentence_bleu_scratch(candidate, references, weights=(0.25, 0.25, 0.25, 0.25)):
    """Compute sentence-level BLEU for one candidate against several references.

    Args:
        candidate: Raw candidate sentence (string).
        references: Iterable of raw reference sentences (strings).
        weights: One weight per n-gram order, starting at unigrams; its
            length decides the highest order that is evaluated.

    Returns:
        The brevity penalty times the weighted geometric mean of the
        modified n-gram precisions, capped at 1. The result is 0 when any
        order with a non-zero weight has no matching n-grams.
    """
    candidate_tokens = tokenize(candidate)
    reference_tokens = [tokenize(reference) for reference in references]

    precisions = [
        modified_precision(candidate_tokens, reference_tokens, order)
        for order in range(1, len(weights) + 1)
    ]

    if all(p == 0 for p in precisions):
        return 0

    # A geometric mean with a zero factor is zero. Dropping zero precisions
    # from the sum instead would inflate the score of a candidate that has,
    # for example, matching unigrams but no matching bigrams.
    if any(p == 0 for w, p in zip(weights, precisions) if w != 0):
        return 0

    # Orders with a zero weight contribute nothing and are skipped, which also
    # keeps log() away from a zero precision on such an order.
    precision_product = math.exp(
        sum(w * math.log(float(p)) for w, p in zip(weights, precisions) if w != 0)
    )
    bp = brevity_penalty(candidate_tokens, reference_tokens)
    bleu = bp * precision_product

    return min(bleu, 1)  # Ensure the BLEU score is between 0 and 1


# Score each candidate against its own list of raw reference sentences.
bleu_score_scratch_1, bleu_score_scratch_2 = (
    sentence_bleu_scratch(cand, refs)
    for cand, refs in ((candidate_1, references_1), (candidate_2, references_2))
)

print(f"BLEU score for example 1: {bleu_score_scratch_1:.2f}")
print(f"BLEU score for example 2: {bleu_score_scratch_2:.2f}")

BLEU score for example 1: 0.46
BLEU score for example 2: 0.40
