### Normalized Google Metrics

$
\large
\text{NGD} ( t_k, t_l ) = \dfrac 
    { \text{max} \big\{
        \text{log} ( f_k ), 
        \text{log} ( f_l )
    \big\}
    - \text{log} ( f_{ kl } ) }
    { \text{log} ( n ) 
    - \text{min} \big\{
        \text{log} ( f_k ), 
        \text{log} ( f_l )
    \big\} }
$

where:
- $ t_k $ and $ t_l $ are terms 
- $ f_k $ and $ f_l $ are the numbers of sentences containing $ t_k $ and $ t_l $, respectively
- $ f_{ kl } $ is the number of sentences containing both $ t_k $ and $ t_l $
- $ n $ is the total number of sentences

$
\large 
sim_{ \text{NGD} } ( t_k, t_l ) = \text{exp} 
    \big( - \text{NGD} ( t_k, t_l ) \big)
$

$
\large 
sim_{ \text{NGD} } ( S_i, S_j ) = 
\dfrac
    { \sum\limits
        _{ \small t_k \in S_i } 
        \sum\limits
            _{ \small t_l \in S_j } 
            sim_{ \text{NGD} } ( t_k, t_l ) }
    { m_i m_j }
$  

where:
- $ S_i $ and $ S_j $ are sentences
- $ m_i $ and $ m_j $ are the numbers of words in $ S_i $ and $ S_j $, respectively

In [None]:
class NormalizedGoogle:
    """Normalized Google Distance (NGD) and derived similarities for one document.

    Frequencies are counted per sentence: a term's frequency is the number of
    stored sentences whose word collection contains the term.
    """

    def __init__(self, document):
        """Split ``document`` into sentences and store the words of each one.

        NOTE(review): ``distinct_words`` is defined elsewhere; presumably it
        returns the unique words of a sentence -- confirm against its definition.
        """
        self.sentence_words = tuple(distinct_words(sent) for sent in tokenize.sent_tokenize(document))

    # TODO: double check the scientific paper's handling of "bad" log values
    def distance(self, term_k, term_l):
        """Return the Normalized Google Distance between two terms.

        Args:
            term_k: first term; must occur in at least one sentence.
            term_l: second term; must occur in at least one sentence.

        Returns:
            ``(max(log f_k, log f_l) - log f_kl) / (log n - min(log f_k, log f_l))``
            in the general case; ``1.0`` when the terms never share a sentence;
            ``0.0`` when both terms occur in every sentence.

        Raises:
            ValueError: if either term does not occur in any sentence.
        """
        # Count all three frequencies in a single pass over the sentences.
        freq_k = freq_l = freq_kl = 0
        for words in self.sentence_words:
            has_k = term_k in words
            has_l = term_l in words
            freq_k += has_k
            freq_l += has_l
            freq_kl += has_k and has_l

        if not (freq_k and freq_l):
            raise ValueError('terms must be in document')

        # log(0) is undefined, so terms that never co-occur get a fixed distance.
        if freq_kl == 0:
            return 1.0

        n = len(self.sentence_words)
        # Both terms appear in every sentence: numerator and denominator are
        # both zero (0/0). The terms always co-occur, so the distance is zero.
        if min(freq_k, freq_l) == n:
            return 0.0

        log_k = math.log(freq_k)
        log_l = math.log(freq_l)
        numerator = max(log_k, log_l) - math.log(freq_kl)
        denominator = math.log(n) - min(log_k, log_l)
        return numerator / denominator

    def term_similarity(self, term_k, term_l):
        """Return ``exp(-NGD(term_k, term_l))``.

        Raises:
            ValueError: propagated from ``distance`` when a term is not in the
                document.
        """
        dist = self.distance(term_k, term_l)
        return math.exp(-dist)

    def sentence_similarity(self, sent_i, sent_j):
        """Return the mean pairwise term similarity between two sentences.

        Args:
            sent_i: sized collection of terms of the first sentence.
            sent_j: sized collection of terms of the second sentence.

        Returns:
            The sum of ``term_similarity`` over every pair in
            ``sent_i x sent_j`` divided by ``len(sent_i) * len(sent_j)``, or
            ``0.0`` when either sentence is empty (there are no pairs to
            average).

        Raises:
            ValueError: propagated from ``distance`` when a term is not in the
                document.
        """
        if not sent_i or not sent_j:
            return 0.0

        total = sum(self.term_similarity(term_k, term_l)
                    for term_k, term_l in itertools.product(sent_i, sent_j))
        return total / len(sent_i) / len(sent_j)