In [1]:
import itertools
import functools
import math
import operator
import string
import random

import numpy as np

from nltk import tokenize
from nltk.util import ngrams

from scipy.special import expit as sigmoid
from scipy.stats import wasserstein_distance as earth_movers_dist

In [3]:
import contextlib
import io

with io.StringIO() as str_io, contextlib.redirect_stdout(str_io):
    import this
    zen = str_io.getvalue()

del str_io, this

# Evolutionary Algorithm for Extractive Text Summarization
https://www.researchgate.net/profile/Ramiz_Aliguliyev/publication/220518077_Evolutionary_Algorithm_for_Extractive_Text_Summarization/links/09e4151356fc2caab6000000.pdf

In [4]:
def distinct_words(text):
    no_punctuation = ''.join(t for t in text if t not in string.punctuation)
    return frozenset(tokenize.word_tokenize(no_punctuation))

text = zen.lower()

D = document_sentences = set(tokenize.sent_tokenize(text))
T = document_distinct_words = distinct_words(text)
S = sentence_distinct_words = {distinct_words(ds) for ds in document_sentences}

## Normalized Google Metrics

<img src="./data/pngs/normalized_google_distance.png" alt="inverse operator psuedo-code" width="75%" align="left"/>

In [None]:
#: (2) -- double check scientific paper's handling of "bad" log values
def norm_google_distance(t_k, t_l, D, S):
    """Metric for distance between two terms-- tₖ, tₗ"""
    
    f_k = sum(t_k in sent for sent in S)
    f_l = sum(t_l in sent for sent in S)
    if not (f_k and f_l):
        raise ValueError('terms must be in document')
    
    f_kl = sum((t_k in sent) and (t_l in sent) for sent in S)
    if (f_k > 0) and (f_l > 0) and (f_kl == 0):
        return 1.0
    
    log_kl = (math.log(f_k), math.log(f_l))
    n = len(D)
    
    numerator = max(log_kl) - math.log(f_kl)
    denominator = math.log(n) - min(log_kl)
    return numerator / denominator

<img src="./data/pngs/sim_ngd_terms.png" alt="inverse operator psuedo-code" width="75%" align="left"/>

In [None]:
#: (1)
def norm_google_similarity_term(t_k, t_l, D, S):
    """Metric for similarity between two terms-- tₖ, tₗ"""
    
    ngd = normalized_google_distance(t_k, t_l, D, S)
    return math.exp(-ngd)

<img src="./data/pngs/sim_ngd_sentences.png" alt="inverse operator psuedo-code" width="75%" align="left"/>

In [112]:
#: (3)
def norm_google_similarity_sent(S_i, S_j, D, S):
    total = sum(sum(norm_google_similarity_term(t_k, t_l, D, S) for t_l in S_j) for t_k in S_i)
    return total / len(S_i) / len(S_j)

In [68]:
# if tₖ == tₗ --> 1
assert norm_google_similarity_term('python', 'python', D, S) == 1

# if (tₖ != tₗ) and (fₖ == fₗ == fₖₗ > 0) --> 1
assert norm_google_similarity_term('explicit', 'implicit', D, S) == 1

In [None]:
C = k_clusters = {frozenset(), ...}
#: 1) Two different clusters should have no sentences in common
# assert all(not C_i & C_j for C_i, C_j in itertools.combinations(C, C))

#: 2) Each sentence should definitely be attached to a cluster
# assert functools.reduce(operator.or_, C) == D

#: 3) Each cluster should have at least one sentence assigned
# assert all(C_p for C_p in C)

<img src="./data/pngs/f.png" alt="inverse operator psuedo-code" width="50%" align="left"/>

In [5]:
#: (4) -- want to maximize
def F(C):
    return pow(1 + sigmoid(F_1(C)), F_2(C))

<img src="./data/pngs/f1.png" alt="inverse operator psuedo-code" width="50%" align="left"/>

In [6]:
#: (5) -- want to maximize
def F_1(C):
    outer = 0
    for C_p in C:
        inner = 0
        for S_i, S_j in itertools.combinations(C_p):
            sim_ngd = norm_google_similarity_sent(S_i, S_i, D, S)           # D, S args
            inner += sim_ngd
        outer += len(C_p) * inner
    return outer

<img src="./data/pngs/f2.png" alt="inverse operator psuedo-code" width="50%" align="left"/>

In [110]:
#: (6) -- want to minimize
def F_2(C):
    C = tuple(C)    # may need to use 'ordered sets'; unclear how to handle p = 1 to k-1
    k = len(C)
    
    sum_0 = 0
    for p in range(k - 1):
        sum_1 = 0
        for q in range(p + 1, k):
            sum_2 = 0
            for S_i in C[p]:
                sum_3 = 0
                for S_j in C[q]:
                    sum_3 += sum(norm_google_similarity_sent(S_i, S_j))
                sum_2 += sum_3
            sum_1 += sum_2 / len(C[q])
        sum_0 += sum_1 / len(C[p])

In [None]:
seed = None  # need reproducibility option

N = len(population)
n = len(sentences)
k = len(clusters)

r = range(N)
s = range(n)

In [None]:
def X_r(t):
    return [x_r_1(t), x_r_2(t),..., x_r_n(t)]

def x_r_s(t):
    return integer in range(k)

def Y_r(t):
    return [y_r_1(t), y_r_2(t),..., y_r_n(t)]

<img src="./data/pngs/y_r_s(t+1).png" alt="inverse operator psuedo-code" width="75%" align="left"/>

In [None]:
#: (7) -- crossover_rate is CR in paper
def y_r_s(t, lam, crossover_rate):
    x_r1_s = random.choice(...)
    x_r2_s = random.choice(...)
    x_r3_s = random.choice(...)
    
    rand_s = random.random(0, 1)
    if rand_s < crossover_rate:
        return x_r1_s(t-1) + lam * (x_r2_s(t-1) - x_r3_s(t-1))
    else:
        return x_r_s(t-1)

<img src="./data/pngs/X_r(t+1).png" alt="inverse operator psuedo-code" width="75%" align="left"/>

In [None]:
#: (8)
def X_r(t, f):
    y = y_r(t-1)
    x = x_r(t-1)
    if f(y) > f(x):
        return y
    else:
        return x

## Fitness
<img src="./data/pngs/fitness.png" alt="inverse operator psuedo-code" width="75%" align="left"/>

In [111]:
#: (9)
def fitness_1(X_a):
    return F_1(X_a)

#: (10)
def fitness_2(X_a):
    return 1 / F_2(X_a)

#: (11)
def fitness(X_a):
    return F(X_a)

## Mutation
<img src="./data/pngs/m_r_s(t+1).png" alt="inverse operator psuedo-code" width="75%" align="left"/>

In [106]:
#: (12)
def m_r_s(t):
    # do I need same rand_s from (7)?
    rand_s = random.uniform(0, 1)
    sigm = sigmoid(y_r_s(t))
    return rand_s < sigm

# need mutation_rate parameter (MR in paper)

## Inversion Operator

### Figure 1 - Psuedo-code
<img src="./data/pngs/fig1_-_inversion_operator_psuedo_code.png" alt="inverse operator psuedo-code" width="50%" align="left"/>

In [103]:
def inversion_operator(X_r, m_r_p1):
    X_r_p1 = [None] * len(X_r)
    S = set()
    
    for i, (x, m) in enumerate(zip(X_r, m_r_p1)):
        if m == 1:
            X_r_p1[i] = x
        else:
            S.add(i)
    
    while len(S) > 0:
        s_min, s_max = min(S), max(S)
        X_r_p1[s_min] = X_r[s_max]
        X_r_p1[s_max] = X_r[s_min]
        S -= {s_max, s_min}
    
    return X_r_p1

### Figure 2 - Example
<img src="./data/pngs/fig2_-_inversion_operator_diagram.png" alt="inverse operator example" width="50%" align="left"/>

In [109]:
X_r = [3, 2, 4, 2, 3, 1, 4, 1]
m_r_p1 = [0, 1, 1, 0, 1, 0, 0, 1]
X_r_p1 = [4, 2, 4, 1, 3, 2, 3, 1]

assert inversion_operator(X_r, m_r_p1) == X_r_p1

## ROUGE-N
(Recall Oriented Understudy for Gisting Evaluation)

In [101]:
#: (14)
def rouge_n(n, y_pred, y_true):
    n_gram_pred = set(ngrams(y_pred, n))
    n_gram_true = set(ngrams(y_true, n))
    return len(n_gram_pred & n_gram_true) / len(n_gram_true)

In [102]:
S1 = 'police killed the gunman'
S2 = 'police kill the gunman'
S3 = 'the gunman kill police'

rouge_n(2, S2, S1)

0.8636363636363636

Preprocess Steps:  
 - stop words
 - lemmatize

In [87]:
y_true = 'a good diet must have apples and bananas'.split()
y_pred = 'apples and bananas are must for a good diet'.split()

n = 2
n_gram_pred = set(ngrams(y_pred, n))
n_gram_true = set(ngrams(y_true, n))

rouge = len(n_gram_pred & n_gram_true) / len(n_gram_true)
rouge

0.5714285714285714