In [63]:
import collections
import contextlib
import itertools
import importlib
import functools
import io
import math
import operator
import string

import numpy as np

import nltk.tokenize

from scipy.special import expit as sigmoid
from scipy.stats import wasserstein_distance as earth_movers_dist

In [2]:
# Capture the text that `import this` prints (the Zen of Python) into `zen`.
with io.StringIO() as str_io, contextlib.redirect_stdout(str_io):
    import this
    if not str_io.getvalue():
        # The module body only prints the first time it is imported in a
        # process. If `this` is already cached in sys.modules (for example
        # when this cell is re-run), reload it so the text is printed again
        # instead of silently leaving `zen` empty.
        importlib.reload(this)
    zen = str_io.getvalue()

# Only `zen` is needed afterwards; drop the temporary names.
del str_io, this

In [67]:
def distinct_words(text):
    """Return the frozenset of word tokens in ``text`` with punctuation removed.

    Every item of ``text`` found in ``string.punctuation`` is dropped, the
    remaining characters are joined back together, and the result is split
    into tokens with ``nltk.tokenize.word_tokenize``.
    """
    kept_chars = itertools.filterfalse(string.punctuation.__contains__, text)
    cleaned = ''.join(kept_chars)
    tokens = nltk.tokenize.word_tokenize(cleaned)
    return frozenset(tokens)

# Work on a lower-cased copy of the captured text.
text = zen.lower()

# Sentences of the document, the distinct words of the whole document, and
# the distinct-word set of each sentence.
document_sentences = set(nltk.tokenize.sent_tokenize(text))
document_distinct_words = distinct_words(text)
sentence_distinct_words = set(map(distinct_words, document_sentences))

# Short aliases used by the metric functions.
D = document_sentences
T = document_distinct_words
S = sentence_distinct_words

### Normalized Google Metrics

In [56]:
#: double check scientific paper's handling of bad log values
def norm_google_distance(t_k, t_l, D, S):
    """Metric for distance between two terms-- tₖ, tₗ

    Computes::

        (max(log fₖ, log fₗ) - log fₖₗ) / (log n - min(log fₖ, log fₗ))

    where fₖ and fₗ count the members of ``S`` containing each term, fₖₗ
    counts the members containing both, and ``n = len(D)``.

    Args:
        t_k, t_l: the two terms to compare.
        D: sized collection of sentences; only ``len(D)`` is used.
        S: iterable of per-sentence word collections supporting ``in``.

    Returns:
        1.0 when the terms never occur together, 0.0 when the formula's
        denominator is zero (see the comment below), otherwise the quotient
        above.

    Raises:
        ValueError: if either term occurs in no member of ``S``.
    """
    # Count occurrences and co-occurrences in a single pass over S.
    f_k = f_l = f_kl = 0
    for sent in S:
        has_k = t_k in sent
        has_l = t_l in sent
        f_k += has_k
        f_l += has_l
        f_kl += has_k and has_l

    if not (f_k and f_l):
        raise ValueError('terms must be in document')

    # Both terms exist but never share a sentence: log(0) is undefined, so
    # the distance is pinned to 1.0.
    if f_kl == 0:
        return 1.0

    log_k, log_l = math.log(f_k), math.log(f_l)
    numerator = max(log_k, log_l) - math.log(f_kl)
    denominator = math.log(len(D)) - min(log_k, log_l)

    if denominator == 0:
        # The rarer term occurs len(D) times. When S has one entry per
        # sentence of D this means both terms appear in every sentence, so
        # they are indistinguishable and the 0/0 form is treated as 0.0.
        return 0.0
    return numerator / denominator


def norm_google_similarity_term(t_k, t_l, D, S):
    """Metric for similarity between two terms-- tₖ, tₗ

    Converts the normalized Google distance of the two terms into a
    similarity with ``exp(-distance)``, so a distance of 0 gives 1 and larger
    distances give smaller values.

    Args:
        t_k, t_l: the two terms to compare.
        D, S: passed through unchanged to ``norm_google_distance``.

    Returns:
        ``math.exp(-norm_google_distance(t_k, t_l, D, S))``.
    """
    # The distance helper is named `norm_google_distance`; the previous call
    # to `normalized_google_distance` raised NameError on a fresh kernel.
    ngd = norm_google_distance(t_k, t_l, D, S)
    return math.exp(-ngd)


def norm_google_similarity_sent(S_i, S_j, D, S):
    """Similarity between two sentences given as collections of terms.

    Sums ``norm_google_similarity_term`` over every pairing of a term from
    ``S_i`` with a term from ``S_j``, then divides by ``len(S_i)`` and by
    ``len(S_j)``. ``D`` and ``S`` are forwarded to the term-level metric.
    """
    def similarity_to_other_sentence(t_k):
        # Total similarity of one term of S_i against every term of S_j.
        return sum(norm_google_similarity_term(t_k, t_l, D, S) for t_l in S_j)

    pairwise_total = sum(map(similarity_to_other_sentence, S_i))
    return pairwise_total / len(S_i) / len(S_j)

In [68]:
# Sanity checks: each term pair below must have a similarity of exactly 1.
assert all(
    norm_google_similarity_term(t_k, t_l, D, S) == 1
    for t_k, t_l in (
        ('python', 'python'),       # tₖ == tₗ
        ('explicit', 'implicit'),   # (tₖ != tₗ) and (fₖ == fₗ == fₖₗ > 0)
    )
)

### Regularization Functions

In [66]:
# Placeholder for the k clusters (a set whose members stand in for clusters);
# the real clustering is not computed in this cell.
k_clusters = {frozenset(), ...}
C = k_clusters

# Invariants a real clustering is expected to satisfy (currently disabled):

#: 1) Two different clusters should have no sentences in common
# assert all(not C_i & C_j for C_i, C_j in itertools.combinations(C, 2))

#: 2) Each sentence should definitely be attached to a cluster
# assert functools.reduce(operator.or_, C) == D

#: 3) Each cluster should have at least one sentence assigned
# assert all(C_p for C_p in C)


#: want to maximize
def F(C):
    """Combined objective for a clustering ``C``.

    Returns ``(1 + sigmoid(F_1(C)))`` raised to the power ``F_2(C)``.
    """
    base = 1 + sigmoid(F_1(C))
    exponent = F_2(C)
    return base ** exponent


#: want to maximize
def F_1(C):
    """Within-cluster similarity term of the objective (to be maximized).

    For each cluster ``C_p`` in ``C``, sums ``norm_google_similarity_sent``
    over every unordered pair of distinct members of the cluster and weights
    that sum by ``len(C_p)``. Clusters with fewer than two members contribute
    0.

    ``D`` and ``S`` are read from the module-level globals.

    Args:
        C: iterable of clusters, each a sized iterable of sentences.

    Returns:
        The weighted sum over all clusters (0 for an empty ``C``).
    """
    outer = 0
    for C_p in C:
        # combinations() needs r=2 so that each pair of distinct sentences is
        # visited once, and both members of the pair must be compared (the
        # sentence must not be compared with itself).
        inner = sum(
            norm_google_similarity_sent(S_i, S_j, D, S)
            for S_i, S_j in itertools.combinations(C_p, 2)
        )
        outer += len(C_p) * inner
    return outer


#: want to minimize
def F_2(C):
    """Between-cluster similarity term of the objective (to be minimized).

    For every unordered pair of distinct clusters ``(C_p, C_q)`` in ``C``,
    sums ``norm_google_similarity_sent`` over each sentence of ``C_p`` paired
    with each sentence of ``C_q``, divides by ``len(C_p) * len(C_q)``, and
    adds the results up.

    ``D`` and ``S`` are read from the module-level globals.

    Args:
        C: iterable of clusters, each a sized iterable of sentences. Every
            cluster must be non-empty; an empty one raises ZeroDivisionError.

    Returns:
        The accumulated between-cluster similarity (0.0 when ``C`` holds
        fewer than two clusters).
    """
    total = 0.0
    # combinations() yields each cluster pair once, so C does not need to be
    # converted to an indexable sequence first.
    for C_p, C_q in itertools.combinations(C, 2):
        # The sentence metric returns a single number per pair and needs the
        # document-level D and S arguments as well.
        pair_similarity = sum(
            norm_google_similarity_sent(S_i, S_j, D, S)
            for S_i, S_j in itertools.product(C_p, C_q)
        )
        total += pair_similarity / (len(C_p) * len(C_q))
    return total
        


AssertionError: 

In [3]:
# Display the text bound to `zen` so the raw document can be inspected.
print(zen)

The Zen of Python, by Tim Peters

Beautiful is better than ugly.
Explicit is better than implicit.
Simple is better than complex.
Complex is better than complicated.
Flat is better than nested.
Sparse is better than dense.
Readability counts.
Special cases aren't special enough to break the rules.
Although practicality beats purity.
Errors should never pass silently.
Unless explicitly silenced.
In the face of ambiguity, refuse the temptation to guess.
There should be one-- and preferably only one --obvious way to do it.
Although that way may not be obvious at first unless you're Dutch.
Now is better than never.
Although never is often better than *right* now.
If the implementation is hard to explain, it's a bad idea.
If the implementation is easy to explain, it may be a good idea.
Namespaces are one honking great idea -- let's do more of those!

