In [1]:
import collections
import functools
import itertools
import math
import operator
import random
import string

import numpy as np

from nltk import tokenize
from nltk.util import ngrams

from scipy.special import expit as sigmoid
from scipy.stats import wasserstein_distance as earth_movers_dist

In [3]:
import codecs
import contextlib
import io

# `import this` prints the Zen of Python only the first time the module is
# executed in a kernel; on any later run of this cell the import is a cache hit
# and nothing is printed, which used to leave `zen` empty. Swallow the print and
# decode the module's ROT13-encoded text (`this.s`) instead, so `zen` is the
# same on every run.
with contextlib.redirect_stdout(io.StringIO()):
    import this

# print() appended a trailing newline to the captured text; keep it so the
# value matches what was captured from stdout before.
zen = codecs.decode(this.s, 'rot13') + '\n'

del this

In [None]:
def distinct_words(text):
    """Return the frozenset of distinct word tokens in ``text``, ignoring punctuation.

    Every character listed in ``string.punctuation`` is removed first; what is
    left is split into tokens with NLTK's ``word_tokenize``.
    """
    strip_punctuation = str.maketrans('', '', string.punctuation)
    return frozenset(tokenize.word_tokenize(text.translate(strip_punctuation)))

# Lower-cased copy of the Zen text; this is what the tokenization cells consume.
text = zen.lower()

# Evolutionary Algorithms
- [Evolutionary Algorithm for Extractive Text Summarization](https://www.researchgate.net/profile/Ramiz_Aliguliyev/publication/220518077_Evolutionary_Algorithm_for_Extractive_Text_Summarization/links/09e4151356fc2caab6000000.pdf)
- [An Improved Evolutionary Algorithm for Extractive Text Summarization](https://link.springer.com/chapter/10.1007/978-3-642-36543-0_9)

## Sentence Clustering

Let document $D$ be decomposed into a set of $n$ sentences.  
$
D = \{S_1, S_2, ..., S_n\}
$

In [None]:
# Sentences of `text` as split by NLTK's sentence tokenizer. Stored as a set, so
# duplicate sentences collapse into one and the original sentence order is lost.
D = document_sentences = set(tokenize.sent_tokenize(text))

Let terms $T$ be the set of all $m$ distinct words in D.  
$
T = \{t_1, t_2, ..., t_m\}
$

In [4]:
# Distinct word tokens of the whole document (punctuation stripped by distinct_words).
T = document_distinct_words = distinct_words(text)

Let $S_i$ represent the set of distinct terms in sentence $S_i$ with 
$m_i$ distinct terms.  
$
S_i = \{t_1, t_2, ..., t_{m_i}\}
$

In [None]:
# One frozenset of distinct words per sentence. Because this is a set of
# frozensets, sentences with identical word sets collapse into a single entry,
# so len(S) can be smaller than len(D).
S = sentence_distinct_words = {distinct_words(ds) for ds in document_sentences}

## Similarity Measures

### Jaccard Coefficient Similarity Measure

$
\large
sim_{jaccard}(S_i, S_j) = \frac{|S_i \cap S_j|}{|S_i \cup S_j|}
$

In [48]:
def jaccard_similarity(a, b):
    """Jaccard coefficient |a ∩ b| / |a ∪ b| of two collections, compared as sets.

    Two empty collections are treated as identical and score 1.0.
    """
    left, right = set(a), set(b)
    union = left | right
    if not union:
        return 1.0
    return len(left & right) / len(union)

### Normalized Google Metrics

$
\large
NGD(t_k, t_l) = \frac{\max\{\log(f_k), \log(f_l)\} - \log(f_{kl})} {\log(n) - \min\{\log(f_k), \log(f_l)\}}
$  

where:
- $t_k$ and $t_l$ are terms 
- $f_k$ is the number of sentences containing $t_k$ (and $f_l$ likewise for $t_l$)
- $f_{kl}$ is the number of sentences containing both $t_k$ and $t_l$
- $n$ is the total number of sentences

In [None]:
# double check scientific paper's handling of "bad" log values
def norm_google_distance(t_k, t_l, D, S):
    """Metric for distance between two terms-- tₖ, tₗ

    Parameters
    ----------
    t_k, t_l : the two terms to compare.
    D : sized collection of the document's sentences; only ``len(D)`` is used.
    S : re-iterable collection of per-sentence term containers (anything
        supporting ``term in sent``); it is iterated more than once.

    Returns
    -------
    float
        ``1.0`` if the terms never share a sentence, ``0.0`` if they occur in
        exactly the same sentences, otherwise
        ``(max(log fₖ, log fₗ) - log fₖₗ) / (log n - min(log fₖ, log fₗ))``.

    Raises
    ------
    ValueError
        If either term occurs in no sentence of ``S``.
    """

    f_k = sum(t_k in sent for sent in S)
    f_l = sum(t_l in sent for sent in S)
    if not (f_k and f_l):
        raise ValueError('terms must be in document')

    f_kl = sum((t_k in sent) and (t_l in sent) for sent in S)
    if f_kl == 0:
        # log(0) is undefined; terms that never co-occur get the distance 1.0.
        return 1.0
    if f_kl == max(f_k, f_l):
        # fₖ == fₗ == fₖₗ: the numerator is exactly 0. Returning here also
        # covers the case where both terms appear in every sentence, for which
        # the denominator is 0 as well and the division would raise.
        return 0.0

    log_kl = (math.log(f_k), math.log(f_l))
    n = len(D)

    numerator = max(log_kl) - math.log(f_kl)
    denominator = math.log(n) - min(log_kl)
    return numerator / denominator

$
\large 
sim_{NGD}(t_k, t_l) = e^{-NGD(t_k, t_l)}
$

In [None]:
#: (1)
def norm_google_similarity_term(t_k, t_l, D, S):
    """Metric for similarity between two terms-- tₖ, tₗ

    Returns ``exp(-NGD(tₖ, tₗ))``, where NGD is computed by
    ``norm_google_distance`` with the same ``D`` and ``S`` arguments. Any
    exception raised by ``norm_google_distance`` propagates unchanged.
    """

    # The distance function in this notebook is named `norm_google_distance`;
    # the previous call used `normalized_google_distance`, which is not defined.
    ngd = norm_google_distance(t_k, t_l, D, S)
    return math.exp(-ngd)

$
\large 
sim_{NGD}(S_i, S_j) = \frac{ \sum\limits_{t_k \in S_i} \sum\limits_{t_l \in S_j} sim_{NGD}(t_k, t_l) }{ m_i m_j }
$  

where:
- $S_i$ and $S_j$ are sentences
- $m_i$ and $m_j$ are the numbers of distinct terms in $S_i$ and $S_j$

In [112]:
#: (3)
def norm_google_similarity_sent(S_i, S_j, D, S):
    """Sentence-level NGD similarity: the term-pair similarities averaged over mᵢ·mⱼ.

    Sums ``norm_google_similarity_term`` over every (tₖ ∈ Sᵢ, tₗ ∈ Sⱼ) pair and
    divides by ``len(S_i)`` and then by ``len(S_j)``.
    """
    def row_total(t_k):
        # Similarity of one term of S_i against every term of S_j.
        return sum(norm_google_similarity_term(t_k, t_l, D, S) for t_l in S_j)

    pair_total = sum(map(row_total, S_i))
    return pair_total / len(S_i) / len(S_j)

In [68]:
# Sanity checks for norm_google_similarity_term using the module-level D and S.

# if tₖ == tₗ --> 1
assert norm_google_similarity_term('python', 'python', D, S) == 1

# if (tₖ != tₗ) and (fₖ == fₗ == fₖₗ > 0) --> 1
assert norm_google_similarity_term('explicit', 'implicit', D, S) == 1

## Objective Functions

Let $C$ be a partition of D with $k$ clusters.  
$
C = \{C_1, C_2, ..., C_k\}
$

In [None]:
# Placeholder only -- no clustering has been computed yet. As written this is a
# set holding one empty frozenset and the `...` (Ellipsis) object, so the
# invariants below stay commented out until a real partition exists.
C = k_clusters = {frozenset(), ...}
#: 1) Two different clusters should have no sentences in common
# assert all(not C_i & C_j for C_i, C_j in itertools.combinations(C, 2))

#: 2) Each sentence should definitely be attached to a cluster
# assert functools.reduce(operator.or_, C) == D

#: 3) Each cluster should have at least one sentence assigned
# assert all(C_p for C_p in C)

### Sigmoid Function
$
\large
sigm(x) = \frac{ 1 }{ 1 + e^{-x} }
$

In [46]:
from scipy.special import expit as sigmoid

### Criterion Function
$
\large
F = (1 + sigm(F_1))^{F_2} \rightarrow max
$

In [49]:
def criterion_function(C):
    """Criterion F = (1 + sigm(F_1)) ** F_2 for a partition ``C``.

    ``F_1`` and ``F_2`` are the notebook's two objective functions,
    ``intra_cluster_imilarity`` and ``inter_cluster_dissimilarity``; they are
    looked up when this function is called, so they may be defined in later
    cells. ``sigmoid`` is ``scipy.special.expit``.
    """
    # The names F_1 / F_2 are never bound in this notebook; call the functions
    # that implement those objectives directly.
    F_1 = intra_cluster_imilarity(C)
    F_2 = inter_cluster_dissimilarity(C)
    return pow(1 + sigmoid(F_1), F_2)

### Intra-Cluster Similarity
$
\large
F_1 = \sum\limits_{p=1}^{k} |C_p| \sum\limits_{S_i, S_j \in C_p} sim(S_i, S_j) \rightarrow max
$

In [6]:
def intra_cluster_imilarity(C):
    """Intra-cluster similarity F_1 of a partition ``C``.

    For every cluster ``C_p`` in ``C``, sums ``norm_google_similarity_sent``
    over each unordered pair of distinct members and weights that sum by
    ``len(C_p)``; the weighted sums are added up over all clusters. A cluster
    with fewer than two members contributes 0.

    Reads the module-level ``D`` and ``S`` and passes them through to
    ``norm_google_similarity_sent``.
    """
    outer = 0
    for C_p in C:
        # combinations() needs the pair size, and each pair must be compared
        # against its partner rather than against itself.
        inner = sum(
            norm_google_similarity_sent(S_i, S_j, D, S)
            for S_i, S_j in itertools.combinations(C_p, 2)
        )
        outer += len(C_p) * inner
    return outer


# Correctly spelled alias; the original (misspelled) name is kept for callers.
intra_cluster_similarity = intra_cluster_imilarity

### Inter-Cluster Dissimilarity
$
\large
F_2 = \sum\limits_{p=1}^{k-1} \frac{1}{|C_p|} \sum\limits_{q=p+1}^{k} \sum\limits_{S_i \in C_p} \sum\limits_{S_l \in C_q} sim(S_i, S_l) \rightarrow min
$

In [110]:
def inter_cluster_dissimilarity(C):
    """Inter-cluster dissimilarity F_2 of a partition ``C``.

    For every pair of clusters (p < q in the tuple order of ``C``), sums
    ``norm_google_similarity_sent`` over all cross-cluster member pairs, divides
    by ``len(C[q])``, accumulates over q, divides by ``len(C[p])`` and
    accumulates over p. Returns that total (0 when there are fewer than two
    clusters).

    Reads the module-level ``D`` and ``S`` and passes them through to
    ``norm_google_similarity_sent``. An empty cluster makes the division raise
    ``ZeroDivisionError``.
    """
    C = tuple(C)    # may need to use 'ordered sets'; unclear how to handle p = 1 to k-1
    k = len(C)

    total = 0
    for p in range(k - 1):
        over_q = 0
        for q in range(p + 1, k):
            # The sentence similarity returns a single number and needs D and S.
            pair_sum = 0
            for S_i in C[p]:
                pair_sum += sum(
                    norm_google_similarity_sent(S_i, S_l, D, S) for S_l in C[q]
                )
            # NOTE(review): the F_2 formula in this notebook divides only by
            # |C_p|; this extra 1/|C_q| is kept from the original code --
            # confirm against the paper.
            over_q += pair_sum / len(C[q])
        total += over_q / len(C[p])
    return total

In [None]:
seed = None  # need reproducibility option

# NOTE(review): `population`, `sentences` and `clusters` are not defined in any
# cell shown in this notebook -- presumably placeholders; confirm before running.
N = len(population)  # population size (number of chromosomes)
n = len(sentences)   # number of sentences (genes per chromosome)
k = len(clusters)    # number of clusters

# Index ranges for chromosomes (r) and genes (s); zero-based, unlike the 1..N
# and 1..n ranges used in the formulas.
r = range(N)
s = range(n)

# Modified Discrete Differential Evolution Algorithm

Initialize the population with $N$ chromosomes each composed of $n$ random integers from \[1, k\]. __NOTE:__ $t$ is the iteration step.

$
X_r(t) = [x_{r,1}(t), x_{r,2}(t), ... x_{r,n}(t)]
$  

where:
- $ x_{r,s}(t) \in \{1, 2, ..., k\} $
- $ r = 1, 2, ..., N $
- $ s = 1, 2, ..., n $
- $N$ is the population size
- $k$ is the number of clusters

In [None]:
def X_r(t):
    """Pseudo-code for chromosome r at step ``t``: the list of its n gene values.

    NOTE(review): not runnable as written -- ``x_r_1``, ``x_r_2`` and ``x_r_n``
    are not defined in the cells shown here, and the ``...`` in the list is the
    Ellipsis object rather than an elision. A later cell defines another
    ``X_r(t, f)`` that shadows this one.
    """
    return [x_r_1(t), x_r_2(t),..., x_r_n(t)]

def x_r_s(t):
    """Pseudo-code for gene s of chromosome r at step ``t``: a cluster label.

    NOTE(review): as written this evaluates the membership test
    ``integer in range(k)``, and ``integer`` is not defined in the cells shown
    here. Also, ``range(k)`` spans 0..k-1 while the text above uses labels
    1..k -- confirm which convention is intended.
    """
    return integer in range(k)

def Y_r(t):
    """Pseudo-code for trial chromosome r at step ``t``: the list of its n gene values.

    NOTE(review): not runnable as written -- ``y_r_1``, ``y_r_2`` and ``y_r_n``
    are not defined in the cells shown here, and the ``...`` in the list is the
    Ellipsis object rather than an elision.
    """
    return [y_r_1(t), y_r_2(t),..., y_r_n(t)]

$
\large
y_{r, s}(t+1) = 
\begin{cases}
    x_{r1,s}(t) + \lambda[x_{r2,s}(t) - x_{r3,s}(t)] & \text{if}\ rand_s < CR \\
    x_{r,s}(t) & \text{otherwise}
\end{cases}
$

where  
- For each $X_r$, randomly sample $X_{r1}(t), X_{r2}(t), X_{r3}(t)$ from the same generation (each distinct)
- $rand_s$ is a uniformly distributed random number from \[0, 1\], drawn once for each $s \in \{1, 2, ..., n\}$

hyper-parameters 
- $\lambda$ is a scale factor from \[0, 1\]
- $CR$ is the crossover rate from \[0, 1\]

In [None]:
def y_r_s(t, lam, crossover_rate):
    """Sketch of the trial-gene rule y_{r,s}(t) for one gene.

    Parameters
    ----------
    t : iteration step.
    lam : scale factor λ applied to the difference term.
    crossover_rate : CR; the crossover branch is taken when a uniform draw
        from [0, 1) is below it.

    Returns
    -------
    ``x_r1_s(t-1) + lam * (x_r2_s(t-1) - x_r3_s(t-1))`` on crossover,
    otherwise ``x_r_s(t-1)``.

    NOTE(review): still pseudo-code -- the three ``random.choice(...)`` calls
    are given the Ellipsis placeholder and will fail until they are pointed at
    the population to sample from; nothing here makes r1, r2, r3 distinct yet.
    """
    x_r1_s = random.choice(...)
    x_r2_s = random.choice(...)
    x_r3_s = random.choice(...)

    # random.random() takes no arguments and already draws from [0.0, 1.0).
    rand_s = random.random()
    if rand_s < crossover_rate:
        return x_r1_s(t-1) + lam * (x_r2_s(t-1) - x_r3_s(t-1))
    else:
        return x_r_s(t-1)

$
\large
X_r(t+1) = 
\begin{cases}
    Y_r(t+1) & \text{if}\ f(Y_r(t+1)) > f(X_r(t)) \\
    X_{r}(t) & \text{otherwise}
\end{cases}
$

where 
- $f(\cdot)$ is the objective function to be maximized

In [None]:
def X_r(t, f):
    """Selection step: keep whichever of ``y`` and ``x`` scores higher under ``f``.

    Returns ``y`` when ``f(y) > f(x)``, otherwise ``x`` (ties keep ``x``).

    NOTE(review): pseudo-code -- ``y_r`` and ``x_r`` are not defined in the
    cells shown here, and this definition reuses the name ``X_r`` from an
    earlier cell. The formula compares Y_r(t+1) with X_r(t), whereas both
    lookups here use ``t-1`` -- confirm the intended indexing.
    """
    y = y_r(t-1)
    x = x_r(t-1)
    if f(y) > f(x):
        return y
    else:
        return x

## Fitness
<img src="./data/pngs/fitness.png" alt="fitness functions" width="33%" align="left"/>

In [111]:
#TODO: rename fitness funcs to their corresponding objective function

#: (9)
def fitness_1(X_a):
    """Fitness of ``X_a`` under the first objective: returns ``F_1(X_a)``.

    NOTE(review): ``F_1`` is not defined in the cells shown here -- presumably
    the intra-cluster similarity objective; confirm.
    """
    return F_1(X_a)

#: (10)
def fitness_2(X_a):
    """Fitness of ``X_a`` under the second objective: returns ``1 / F_2(X_a)``.

    The reciprocal raises ``ZeroDivisionError`` when ``F_2(X_a)`` is 0.

    NOTE(review): ``F_2`` is not defined in the cells shown here -- presumably
    the inter-cluster dissimilarity objective; confirm.
    """
    return 1 / F_2(X_a)

#: (11)
def fitness(X_a):
    """Combined fitness of ``X_a``: returns ``F(X_a)``.

    NOTE(review): ``F`` is not defined in the cells shown here -- presumably
    the criterion function; confirm.
    """
    return F(X_a)

## Mutation

$
\large
m_{r,s}(t+1) = 
\begin{cases}
    1 & \text{if}\ rand_s < sigm(y_{r,s}(t+1)) \\
    0 & \text{otherwise}
\end{cases}
$

In [106]:
def m_r_s(t):
    """Mutation flag for one gene: is a uniform draw from [0, 1] below sigm(y_r_s(t))?

    Returns the result of that comparison (truthy for 1, falsy for 0 in the
    formula above).

    NOTE(review): ``y_r_s`` is called with ``t`` only, while it is defined with
    ``(t, lam, crossover_rate)`` -- confirm how those should be supplied.
    """
    # do I need same rand_s from (7)?
    return random.uniform(0, 1) < sigmoid(y_r_s(t))

# need mutation_rate parameter (MR in paper)

## Inversion Operator

### Figure 1 - Pseudo-code
<img src="./data/pngs/fig1_-_inversion_operator_psuedo_code.png" alt="inversion operator pseudo-code" width="33%" align="left"/>

In [43]:
def inversion_operator(X_r, m_r_p1):
    """Return a copy of ``X_r`` with its non-mutating positions mirrored.

    Positions whose entry in ``m_r_p1`` is falsy are paired from the outside in
    (lowest index with highest, second lowest with second highest, ...) and the
    values of each pair are swapped. Positions with a truthy mask entry, and a
    lone middle position, keep their value. ``X_r`` itself is not modified.
    """
    swapped = X_r.copy()
    free_positions = [idx for idx, flag in enumerate(m_r_p1) if not flag]

    lo, hi = 0, len(free_positions) - 1
    while lo <= hi:
        left, right = free_positions[lo], free_positions[hi]
        swapped[left] = X_r[right]
        swapped[right] = X_r[left]
        lo += 1
        hi -= 1

    return swapped

### Figure 2 - Example
<img src="./data/pngs/fig2_-_inversion_operator_diagram.png" alt="inversion operator example" width="33%" align="left"/>

In [44]:
# Worked example for inversion_operator (presumably the values shown in Figure 2).
# NOTE: assigning `X_r` here rebinds the name that earlier cells use for a function.
X_r = [3, 2, 4, 2, 3, 1, 4, 1]      # input chromosome
m_r_p1 = [0, 1, 1, 0, 1, 0, 0, 1]   # mask; a 0 marks a position that gets swapped
X_r_p1 = [4, 2, 4, 1, 3, 2, 3, 1]   # expected output

assert inversion_operator(X_r, m_r_p1) == X_r_p1

## ROUGE-N
(Recall Oriented Understudy for Gisting Evaluation)  
<img src="./data/pngs/rouge-n.png" alt="ROUGE-N formula" width="33%" align="left"/>

In [7]:
#: (14)
def rouge_n(n, y_pred, y_true):
    """ROUGE-N recall of a candidate token sequence against a reference.

    Parameters
    ----------
    n : n-gram length.
    y_pred : candidate (predicted) tokens.
    y_true : reference tokens.

    Returns
    -------
    float
        Number of reference n-grams also found in the candidate, divided by
        the number of reference n-grams. Repeated n-grams are counted with
        multiplicity; a match is credited at most as many times as the n-gram
        occurs in the candidate. Returns ``0.0`` when the reference has no
        n-grams (e.g. it is shorter than ``n``) instead of dividing by zero.
    """
    pred_counts = collections.Counter(ngrams(y_pred, n))
    true_counts = collections.Counter(ngrams(y_true, n))

    total = sum(true_counts.values())
    if not total:
        return 0.0

    # Counter intersection keeps the minimum count per n-gram (clipped matches).
    matched = sum((pred_counts & true_counts).values())
    return matched / total

### testing rouge_n
https://rare-technologies.com/text-summarization-in-python-extractive-vs-abstractive-techniques-revisited/#how_to_evaluate_text

In [12]:
# Reference and candidate summaries, split on whitespace into token lists.
y_true = 'a good diet must have apples and bananas'.split()
y_pred = 'apples and bananas are must for a good diet'.split()

# 7 of the 8 reference unigrams and 4 of the 7 reference bigrams occur in the candidate.
assert rouge_n(1, y_pred, y_true) == 7 / 8
assert rouge_n(2, y_pred, y_true) == 4 / 7

Preprocess Steps:  
 - stop words
 - lemmatize