# BLEU Score

In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [2]:
#export
import sys
from os.path import join

sys.path.insert(0, '/'.join(sys.path[0].split('/')[:-1] + ['scripts']))
from gru import *

In [3]:
# Toy experiment: sentences are encoded as lists of integer token ids
# target:     she read the book because she was interested in world history
tar = [1, 2, 3, 4, 5, 1, 6, 7, 8, 9, 10]
# prediction: she was interested in world history because she read the book
# NOTE(review): the sixth id is 0 while 'history' is 10 in `tar`, so one unigram never matches - confirm this is intended
pre = [1, 6, 7, 8, 9, 0, 5, 1, 2, 3, 4]
c_pre, c_tar = Counter(pre), Counter(tar)

# Clipped unigram matches: each predicted token counts at most as often as it occurs in the target
single_gram = sum(min(count, c_tar[g]) for g, count in c_pre.items())
print(single_gram)

10


In [4]:
#export
import numpy as np

class NGram():
    '''Hashable wrapper around a sequence of token ids, so n-grams can be used as Counter/dict keys.

    n_gram: indexable sequence of token ids (must support len() and iteration)
    vocab_size: base used to fold the token ids into a single hash value
    '''
    def __init__(self, n_gram, vocab_size=5000):
        self.n_gram, self.vocab_size = n_gram, vocab_size

    def __eq__(self, other):
        '''Two NGrams are equal when they have the same length and the same ids at every position'''
        # NotImplemented lets Python fall back to its default comparison instead of raising on foreign types
        if not isinstance(other, NGram): return NotImplemented
        if len(self.n_gram) != len(other.n_gram): return False
        # bool() so callers receive a plain Python bool rather than numpy.bool_
        return bool(np.all(np.array(self.n_gram) == np.array(other.n_gram)))

    def __hash__(self):
        '''Positional encoding of the ids in base `vocab_size` (id at position i is weighted by vocab_size**i)'''
        return int(sum([tok * self.vocab_size**i for i, tok in enumerate(self.n_gram)]))

    def __repr__(self):
        return f'{self.n_gram}'

In [5]:
#export
def get_grams(inp, n, vocab_size=5000):
    '''Slide a window of width `n` over `inp` and wrap every window in an NGram.

    Returns a list with len(inp)-n+1 NGrams (empty when `inp` is shorter than `n`).
    '''
    n_windows = len(inp) - n + 1
    grams = []
    for start in range(n_windows):
        window = inp[start:start + n]
        grams.append(NGram(window, vocab_size))
    return grams

In [6]:
# Print the n-gram counts of the toy prediction `pre` for n = 1..4
for i in range(4):
    print(Counter(get_grams(pre, i+1)))

Counter({[1]: 2, [6]: 1, [7]: 1, [8]: 1, [9]: 1, [0]: 1, [5]: 1, [2]: 1, [3]: 1, [4]: 1})
Counter({[1, 6]: 1, [6, 7]: 1, [7, 8]: 1, [8, 9]: 1, [9, 0]: 1, [0, 5]: 1, [5, 1]: 1, [1, 2]: 1, [2, 3]: 1, [3, 4]: 1})
Counter({[1, 6, 7]: 1, [6, 7, 8]: 1, [7, 8, 9]: 1, [8, 9, 0]: 1, [9, 0, 5]: 1, [0, 5, 1]: 1, [5, 1, 2]: 1, [1, 2, 3]: 1, [2, 3, 4]: 1})
Counter({[1, 6, 7, 8]: 1, [6, 7, 8, 9]: 1, [7, 8, 9, 0]: 1, [8, 9, 0, 5]: 1, [9, 0, 5, 1]: 1, [0, 5, 1, 2]: 1, [5, 1, 2, 3]: 1, [1, 2, 3, 4]: 1})


In [7]:
#export
def get_correct_n_grams(pre, tar, n, vocab_size=5000):
    '''Count clipped n-gram matches between a prediction and a target.

    Returns a tuple (matches, total): `matches` adds up, for every distinct n-gram of `pre`,
    the smaller of its count in `pre` and its count in `tar`; `total` is the number of n-grams in `pre`.
    '''
    pre_grams = get_grams(pre, n, vocab_size)
    pre_counts = Counter(pre_grams)
    tar_counts = Counter(get_grams(tar, n, vocab_size))
    matches = 0
    for gram, count in pre_counts.items():
        # An n-gram missing from the target has count 0 there, so it contributes nothing
        matches += min(count, tar_counts[gram])
    return matches, len(pre_grams)

In [8]:
# Print (clipped matches, number of n-grams in `pre`) for n = 1..4
# NOTE(review): vocab_size=2 is smaller than the token ids in use, so NGram hashes may collide and
# equality has to tell the n-grams apart - confirm this is a deliberate collision check
for i in range(4):
    print(get_correct_n_grams(pre, tar, i+1, 2))

(10, 11)
(8, 10)
(5, 9)
(3, 8)


In [9]:
#export
def bleu(pre, tar, max_grams=4, vocab_size=5000):
    '''Compute BLEU score between two sentences with length penalty.

    pre: predicted sentence as a sequence of token ids
    tar: target (reference) sentence as a sequence of token ids
    max_grams: highest n-gram order; orders 1..max_grams are combined with a geometric mean
    vocab_size: forwarded to the n-gram hashing

    Returns a float; 0.0 when any n-gram order has no match or when `pre` is too short
    to contain an n-gram of every order. Raises ValueError if max_grams < 1.
    '''
    if max_grams < 1: raise ValueError(f'max_grams must be >= 1, got {max_grams}')
    precision = 1.0
    for n in range(1, max_grams + 1):
        correct, total = get_correct_n_grams(pre, tar, n, vocab_size)
        # No n-grams of this order (would divide by zero) or no match: the geometric mean is 0
        if total == 0 or correct == 0: return 0.0
        precision *= correct / total
    # Predictions shorter than the target are penalised
    len_penalty = exp(1 - len(tar)/len(pre)) if len(pre) < len(tar) else 1
    # Geometric mean over the orders actually used, not a fixed 4th root
    return len_penalty * precision ** (1 / max_grams)

In [10]:
# Sentence-level BLEU of the toy prediction `pre` against the toy target `tar`
bleu(pre, tar)

0.6238986072117501

In [11]:
#export
def corpus_bleu_score(pres, tars, max_grams=4, vocab_size=5000):
    '''Compute BLEU score between two list of sentences (corpus) with length penalty.

    pres: iterable of predicted sentences (sequences of token ids)
    tars: iterable of target sentences, paired with `pres` by position
    max_grams: highest n-gram order; orders 1..max_grams are combined with a geometric mean
    vocab_size: forwarded to the n-gram hashing

    Match counts and n-gram totals are summed over the whole corpus before the precisions
    are formed. Returns a float; 0.0 when the corpus is empty or when some n-gram order has
    no n-grams or no match. Raises ValueError if max_grams < 1.
    '''
    if max_grams < 1: raise ValueError(f'max_grams must be >= 1, got {max_grams}')
    pre_len, tar_len = 0, 0
    corrects, totals = [0] * max_grams, [0] * max_grams
    for pre, tar in zip(pres, tars):
        pre_len += len(pre)
        tar_len += len(tar)
        for n in range(max_grams):
            correct, total = get_correct_n_grams(pre, tar, n+1, vocab_size)
            corrects[n] += correct
            totals[n] += total
    # An order without n-grams (would divide by zero) or without matches: the geometric mean is 0
    if 0 in totals or 0 in corrects: return 0.0
    precision = 1.0
    for correct, total in zip(corrects, totals):
        precision *= correct / total
    # Corpora whose predictions are shorter overall than the targets are penalised
    len_penalty = exp(1 - tar_len/pre_len) if pre_len < tar_len else 1
    # Geometric mean over the orders actually used, not a fixed 4th root
    return len_penalty * precision ** (1 / max_grams)

In [12]:
# A corpus holding a single sentence pair must score the same as the sentence-level BLEU
tars = [tar] # she read the book because she was interested in world history
pres = [pre] # she was interested in world history because she read the book
# Prediction first, target second, matching the bleu(pre, tar) signature
bleu_score = bleu(pres[0], tars[0])
corpus_score = corpus_bleu_score(pres, tars)
test_near(torch.Tensor([bleu_score]), torch.Tensor([corpus_score]))

In [13]:
# Show the sentence-level BLEU value stored in `bleu_score`
print(bleu_score)

0.6238986072117501


## Callback

In [14]:
#export
class BLEUScore(Callback):
    '''Callback to compute BLEU score for training NLP models.

    Match counts and n-gram totals are accumulated batch by batch; at the end of each epoch
    the corpus-level BLEU is computed, printed and appended to `self.bleu_scores`.

    max_grams: highest n-gram order; orders 1..max_grams are combined with a geometric mean
    vocab_size: forwarded to the n-gram hashing
    '''
    def __init__(self, max_grams=4, vocab_size=5000):
        # Fail fast: a score over zero n-gram orders is undefined
        if max_grams < 1: raise ValueError(f'max_grams must be >= 1, got {max_grams}')
        self.vocab_size = vocab_size
        self.max_grams = max_grams
        self.bleu_scores = []
        # Create the accumulators up front so after_batch also works before the first before_epoch
        self._reset()

    def _reset(self):
        '''Zero the per-epoch accumulators'''
        self.pre_len, self.tar_len = 0, 0
        self.precisions, self.lengths = [0] * self.max_grams, [0] * self.max_grams

    def before_epoch(self, **kwargs):
        '''Start every epoch from empty statistics'''
        self._reset()

    def after_batch(self, pres, tars):
        '''Add the statistics of one batch.

        pres, tars: iterables of predicted / target sentences paired by position
        (each sentence must support len(); presumably sequences of token ids - verify against caller)
        '''
        for pre, tar in zip(pres, tars):
            self.pre_len += len(pre)
            self.tar_len += len(tar)
            for n in range(self.max_grams):
                precision, length = get_correct_n_grams(pre, tar, n+1, self.vocab_size)
                self.precisions[n] += precision
                self.lengths[n] += length

    def after_epoch(self):
        '''Compute, record and print the BLEU score of the finished epoch'''
        if 0 in self.lengths or 0 in self.precisions:
            # An order without n-grams (would divide by zero) or without matches: the geometric mean is 0
            bleu_score = 0.0
        else:
            precision = 1.0
            for p, l in zip(self.precisions, self.lengths):
                precision *= p / l
            # Predictions shorter overall than the targets are penalised
            len_penalty = exp(1 - self.tar_len/self.pre_len) if self.pre_len < self.tar_len else 1
            # Geometric mean over the orders actually used, not a fixed 4th root
            bleu_score = len_penalty * precision ** (1 / self.max_grams)
        self.bleu_scores.append(bleu_score)
        print(f'BLEU: {bleu_score}')

    def __repr__(self):
        return f'BLEUScore({self.max_grams})'

In [15]:
# Instantiate the callback with its default arguments and display its repr
bleuScore = BLEUScore()
bleuScore

BLEUScore(4)