In [1]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/anantraj/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [90]:
from collections import Counter
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
import numpy as np

def aggregate_bleu_score(refs, sys, n=2):
    
    p_ns = np.zeros(n)
    c = 0  
    r = 0  

    for predicted in sys:
        predicted_tokens = word_tokenize(predicted)
        reference_tokens = [word_tokenize(ref[0]) for ref in refs]

        c += len(predicted_tokens)
        r += min([len(target) for target in reference_tokens], key=lambda x: abs(x - len(predicted_tokens)))

        for i in range(1, n + 1):
            pred_ngrams = list(ngrams(predicted_tokens, i, pad_right=True, right_pad_symbol=None))
            max_ref_ngrams = {}
            for ref_tokens in reference_tokens:

                ref_ngrams = list(ngrams(ref_tokens, i, pad_right=True, right_pad_symbol=None))
                ref_ngram_counts = Counter(ref_ngrams)
                for ngram in ref_ngram_counts:
                    if ngram in max_ref_ngrams:
                        max_ref_ngrams[ngram] = max(max_ref_ngrams[ngram], ref_ngram_counts[ngram])
                    else:
                        max_ref_ngrams[ngram] = ref_ngram_counts[ngram]

            clipped_count = sum(min(count, max_ref_ngrams.get(ngram, 0)) for ngram, count in Counter(pred_ngrams).items())
            total_count = len(pred_ngrams)
            p_ns[i - 1] += clipped_count / total_count if total_count > 0 else 0

    #print('p_ns',p_ns)
    p_ns = p_ns / len(sys)  # Average precision per n-gram 
    brevity_penalty = np.exp(1 - r / c) if c < r else 1

    bleu_score = brevity_penalty * np.exp(sum(np.log(p) for p in p_ns) / n)
    return bleu_score

# # Example usage with your provided input
refs = [['Sì, verrete in'],
        ['Siete ferito, signore?']]
sys = ['spietate stabilirmi ',
       'Siete spietate anticamera signore ?']

# refs = [['Alice is'],['She loves UK more than states']]
# sys = ['Alice loves UK','US ']

bs = aggregate_bleu_score(refs, sys)
bs

0.1840735921322381

In [87]:
def dynamic_aggregate_bleu_score(refs, sys, max_n=4):
    
    p_ns = np.zeros(max_n)
    c = 0  # Total length of system outputs
    r = 0  # Total length of the closest reference lengths

    for predicted in sys:
        predicted_tokens = word_tokenize(predicted)
        reference_tokens = [word_tokenize(ref[0]) for ref in refs]

        c += len(predicted_tokens)
        r += min([len(target) for target in reference_tokens], key=lambda x: abs(x - len(predicted_tokens)))

        # Find the smallest sentence length among the current predicted sentence and all references
        min_length = min([len(predicted_tokens)] + [len(ref) for ref in reference_tokens])

        # Determine the highest n-gram order that makes sense given the sentence lengths
        effective_n = min(max_n, min_length)

        for i in range(1, effective_n + 1):
            pred_ngrams = list(ngrams(predicted_tokens, i, pad_right=True, right_pad_symbol=None))
            max_ref_ngrams = {}

            for ref_tokens in reference_tokens:
                ref_ngrams = list(ngrams(ref_tokens, i, pad_right=True, right_pad_symbol=None))
                ref_ngram_counts = Counter(ref_ngrams)
                for ngram in ref_ngram_counts:
                    max_ref_ngrams[ngram] = max(max_ref_ngrams.get(ngram, 0), ref_ngram_counts[ngram])

            clipped_count = sum(min(count, max_ref_ngrams.get(ngram, 0)) for ngram, count in Counter(pred_ngrams).items())
            total_count = len(pred_ngrams)
            p_ns[i - 1] += clipped_count / total_count if total_count > 0 else 0

    # Adjust p_ns for the number of sentences processed
    p_ns = p_ns / len(sys)  # Average precision per n-gram across all sentences considered
    brevity_penalty = np.exp(1 - r / c) if c < r else 1

    # Compute BLEU score using only the effective n-grams
    bleu_score = brevity_penalty * np.exp(sum(np.log(p) for p in p_ns[:effective_n]) / effective_n)
    return bleu_score

# Example usage with your provided input
refs = [['Sì, verrete in'],
        ['Siete ferito, signore?']]
sys = ['spietate stabilirmi ',
       'Siete spietate anticamera signore ?']

# refs = [['Alice is'],['She loves UK more than states']]
# sys = ['Alice loves UK','US ']
bs = dynamic_aggregate_bleu_score(refs, sys)
print(bs)

0.1663292666623517


[nltk_data] Downloading package punkt to /Users/anantraj/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
