In [26]:
import collections
import itertools
import functools
import math
import operator
import string
import random

import numpy as np

from nltk import tokenize
from nltk.util import ngrams

from scipy.special import expit as sigmoid

In [2]:
import contextlib
import io
import sys

# `import this` prints the Zen of Python only when the module body actually
# executes.  Drop any cached copy first so that re-running this cell (or
# running it after something else already imported `this`) still captures the
# poem instead of silently leaving `zen` as an empty string.
sys.modules.pop('this', None)

# Capture what the import prints instead of letting it reach the cell output.
with io.StringIO() as str_io, contextlib.redirect_stdout(str_io):
    import this
    zen = str_io.getvalue()

# Only `zen` is needed downstream; remove the helper names from the namespace.
del str_io, this

In [3]:
def distinct_words(text):
    """Return the distinct word tokens of ``text`` as a frozenset.

    Every character contained in ``string.punctuation`` is dropped first;
    what remains is split into tokens with ``tokenize.word_tokenize``.
    """
    kept_chars = [char for char in text if char not in string.punctuation]
    stripped = ''.join(kept_chars)
    tokens = tokenize.word_tokenize(stripped)
    return frozenset(tokens)

# Work on a lower-cased copy of the captured text; the bare name displays it.
text = zen.lower()
text

"the zen of python, by tim peters\n\nbeautiful is better than ugly.\nexplicit is better than implicit.\nsimple is better than complex.\ncomplex is better than complicated.\nflat is better than nested.\nsparse is better than dense.\nreadability counts.\nspecial cases aren't special enough to break the rules.\nalthough practicality beats purity.\nerrors should never pass silently.\nunless explicitly silenced.\nin the face of ambiguity, refuse the temptation to guess.\nthere should be one-- and preferably only one --obvious way to do it.\nalthough that way may not be obvious at first unless you're dutch.\nnow is better than never.\nalthough never is often better than *right* now.\nif the implementation is hard to explain, it's a bad idea.\nif the implementation is easy to explain, it may be a good idea.\nnamespaces are one honking great idea -- let's do more of those!\n"

# Evolutionary Algorithms
- [Discrete Differential Evolution for Text Summarization](https://www.researchgate.net/publication/281662415_Discrete_Differential_Evolution_for_Text_Summarization)
- [Evolutionary Algorithm for Extractive Text Summarization](https://www.researchgate.net/profile/Ramiz_Aliguliyev/publication/220518077_Evolutionary_Algorithm_for_Extractive_Text_Summarization/links/09e4151356fc2caab6000000.pdf)
- [An Improved Evolutionary Algorithm for Extractive Text Summarization](https://link.springer.com/chapter/10.1007/978-3-642-36543-0_9)

## Sentence Clustering

Let document $D$ be decomposed into a set of $n$ sentences.  
$
D = \{S_1, S_2, ..., S_n\}
$

In [4]:
# Sentences of the document. NOTE: set() discards sentence order and duplicates.
D = document_sentences = set(tokenize.sent_tokenize(text))

Let terms $T$ be the set of all $m$ distinct words in D.  
$
T = \{t_1, t_2, ..., t_m\}
$

In [5]:
# Distinct, punctuation-free word tokens of the whole document.
T = document_distinct_words = distinct_words(text)

Let $S_i$ represent the set of distinct terms in sentence $S_i$ with 
$m_i$ distinct terms.  
$
S_i = \{t_1, t_2, ..., t_{m_i}\}
$

In [6]:
# One frozenset of distinct words per sentence. Being a set of frozensets,
# sentences that share exactly the same words collapse into a single entry.
S = sentence_distinct_words = {distinct_words(ds) for ds in document_sentences}

In [7]:
# seed = None  # need reproducibility option

# N = len(population)
# n = len(sentences)
# k = len(clusters)

# r = range(N)
# s = range(n)

## Similarity Measures

### Jaccard Coefficient Similarity Measure

$
\large
sim_{jaccard}(S_i, S_j) = \frac{|S_i \cap S_j|}{|S_i \cup S_j|}
$

In [8]:
def jaccard_similarity(a, b):
    """Jaccard coefficient ``|A & B| / |A | B|`` of two iterables.

    Both arguments are converted to sets before comparing.  Two empty
    inputs are treated as identical and give 1.0.
    """
    left, right = set(a), set(b)
    union = left | right
    # The union is empty exactly when both inputs are empty.
    if not union:
        return 1.0
    shared = left & right
    return len(shared) / len(union)

In [97]:
# Two word lists with no word in common, so the coefficient is 0.
a = "however doctors learned that long inactivity did more harm than good".split()
b = "patients got out of shape developed blood clots and became demoralized".split()

jaccard_similarity(a, b)

0.0

### Normalized Google Metrics

$
\large
NGD(t_k, t_l) = \dfrac 
{ max\big\{log(f_k), log(f_l)\big\} - log(f_{kl}) }
{ log(n) - min\big\{log(f_k), log(f_l)\big\} }
$  

where:
- $t_k$ and $t_l$ are terms 
- $f_k$ is the number of sentences containing $t_k$
- $f_{kl}$ is the number of sentences containing both $t_k$ and $t_l$
- $n$ is the total number of sentences

$
\large 
sim_{NGD}(t_k, t_l) = e^{-NGD(t_k, t_l)}
$

$
\large 
sim_{NGD}(S_i, S_j) = \dfrac
{ \sum\limits_{ \small t_k \in S_i} \sum\limits_{ \small t_l \in S_j} sim_{NGD}(t_k, t_l) }
{ m_i m_j }
$  

where:
- $S_i$ and $S_j$ are sentences
- $m_i$ is the number of words in $S_i$

In [9]:
class NormalizedGoogle:
    """Normalized Google Distance (NGD) similarity within a single document.

    The document is split into sentences and every sentence is reduced to its
    set of distinct words.  Term frequencies are counted in sentences (how
    many sentences contain the term), not in raw occurrences.
    """

    def __init__(self, document):
        """Store one set of distinct words per sentence of ``document``."""
        self.sentence_words = tuple(distinct_words(sent) for sent in tokenize.sent_tokenize(document))

    # double check scientific paper's handling of "bad" log values
    def distance(self, term_k, term_l):
        """Return NGD(term_k, term_l).

        Returns:
            1.0 when the terms never share a sentence, 0.0 when both occur in
            every sentence, otherwise
            ``(max(log f_k, log f_l) - log f_kl) / (log n - min(log f_k, log f_l))``.

        Raises:
            ValueError: if either term does not occur in the document.
        """
        # Single pass over the sentences instead of three separate scans.
        freq_k = freq_l = freq_kl = 0
        for sent in self.sentence_words:
            has_k = term_k in sent
            has_l = term_l in sent
            freq_k += has_k
            freq_l += has_l
            freq_kl += has_k and has_l

        if not (freq_k and freq_l):
            raise ValueError('terms must be in document')

        # log(0) is undefined: terms that never co-occur get distance 1.
        if freq_kl == 0:
            return 1.0

        n = len(self.sentence_words)
        # If the rarer term is in every sentence then so is the other one, so
        # f_k == f_l == f_kl == n and the formula degenerates to 0 / 0.  The
        # terms always co-occur, hence distance 0.
        if min(freq_k, freq_l) == n:
            return 0.0

        logs_k_l = (math.log(freq_k), math.log(freq_l))
        numerator = max(logs_k_l) - math.log(freq_kl)
        denominator = math.log(n) - min(logs_k_l)
        return numerator / denominator

    def term_similarity(self, term_k, term_l):
        """Return ``exp(-NGD(term_k, term_l))``; 1 means maximally similar."""
        dist = self.distance(term_k, term_l)
        return math.exp(-dist)

    def sentence_similarity(self, sent_i, sent_j):
        """Average term similarity over all term pairs of two sentences.

        ``sent_i`` and ``sent_j`` are collections of terms.  An empty sentence
        has no term pairs to average, so the similarity is 0.0.
        """
        if len(sent_i) == 0 or len(sent_j) == 0:
            return 0.0
        total = sum(self.term_similarity(term_k, term_l)
                    for term_k, term_l in itertools.product(sent_i, sent_j))
        return total / len(sent_i) / len(sent_j)

In [10]:
# Sanity checks of the NGD term similarity on the lower-cased document text.
norm_google = NormalizedGoogle(text)

# if tₖ == tₗ --> 1
assert norm_google.term_similarity('better', 'better') == 1

# if (tₖ != tₗ) and (fₖ == fₗ == fₖₗ > 0) --> 1
assert norm_google.term_similarity('explicit', 'implicit') == 1

## Objective Functions

Let $C$ be a partition of D with $k$ clusters.  
$
C = \{C_1, C_2, ..., C_k\}
$  

where:
- $ C_p \cap C_q = \emptyset, \forall p \ne q \in \{1, 2, ..., k\} $
- $ \bigcup\limits_{p=1}^k C_p = D $
- $ C_p \ne \emptyset, \forall p \in \{1, 2, ..., k\} $

In [11]:
def verify_cluster(C, D):
    """Raise ``ValueError`` unless ``C`` is a partition of ``D``.

    Args:
        C: iterable of sets/frozensets (the clusters).
        D: set of all elements that must be covered.

    A partition requires pairwise disjoint clusters, a union equal to ``D``
    and no empty cluster.  Returns ``None`` when all three hold.
    """
    clusters = list(C)  # allow one-shot iterables; they are traversed three times
    # combinations(..., 2) yields every unordered pair of clusters exactly once.
    disjoint = all(C_p.isdisjoint(C_q)
                   for C_p, C_q in itertools.combinations(clusters, 2))
    # The initial frozenset() keeps reduce() from failing on an empty C.
    union = functools.reduce(operator.or_, clusters, frozenset()) == D
    nonempty = all(C_p for C_p in clusters)
    if not (disjoint and union and nonempty):
        raise ValueError('cluster is not a partition')

### Sigmoid Function
$
\large
sigm(x) = \frac{ 1 }{ 1 + e^{-x} }
$

In [12]:
from scipy.special import expit as sigmoid

### Intra-Cluster Similarity (Cohesion)

$
\large
F_1 = 
\sum\limits_{ \small p=1}^{ \small k} 
\sum\limits_{ \small S_i, S_j \in C_p} 
\frac {sim(S_i, S_j)} {|C_p|} 
\rightarrow max
$

*(__Note:__ Evol Alg for Ext Txt Summ doesn't show division of $|C_p|$ but I believe this to be a typo since the paragraph detailing it says "the average sum". Additionally the Disc Diff Evol for Txt Summ shows it as such.)*

In [35]:
def clusterize(chromosome):
    """Group sentence positions by their cluster label.

    ``chromosome[i]`` is the cluster label of sentence ``i``.  The result is
    a ``defaultdict(list)`` mapping each label to the positions carrying it,
    with labels in order of first appearance and positions ascending.
    """
    members_by_label = collections.defaultdict(list)
    for sentence_idx, label in enumerate(chromosome):
        members = members_by_label[label]
        members.append(sentence_idx)
    return members_by_label

In [55]:
# Toy corpus plus a hand-written chromosome (one cluster label per sentence).
# NOTE(review): the sentences are raw strings, so a set-based measure such as
# jaccard_similarity compares their characters rather than their words --
# confirm that this is intended before trusting the scores below.
sentences = np.array(['to the moon', 'moon shine is good', 'wolves howl to the moon', 'sunny day as ever', 
                      'it is windy', 'go to the store now', 'testing testing testing', 'python good js bad', 
                      'time to leave', 'the time is now', 'finally the wolves are here'])
chromosome = np.array([0, 4, 0, 2, 1, 1, 1, 3, 4, 3, 2])
clusterize(chromosome)

defaultdict(list, {0: [0, 2], 4: [1, 8], 2: [3, 10], 1: [4, 5, 6], 3: [7, 9]})

In [56]:
def cohesion(chromosome, sentences, sim):
    """Intra-cluster similarity (objective F1) of a partition.

    Args:
        chromosome: cluster label for every sentence position.
        sentences: array indexable with a list of positions.
        sim: callable ``sim(sentence_a, sentence_b)`` returning a number.

    For every cluster, the similarity of each unordered pair of its members
    is divided by the cluster size; all of these terms are summed.
    """
    score = 0
    for members in clusterize(chromosome).values():
        size = len(members)
        for first, second in itertools.combinations(members, 2):
            pair = sentences[[first, second]]
            score += sim(pair[0], pair[1]) / size
    return score

In [57]:
# Cohesion (F1) of the toy partition under the Jaccard measure.
cohesion(chromosome, sentences, jaccard_similarity)

1.3852079293255766

### Inter-Cluster Dissimilarity (Separation)

$
\large
F_2 = 
\sum\limits_{ \small p=1}^{ \small k-1}
\sum\limits_{ \small q=p+1}^{ \small k} 
\sum\limits_{ \small S_i \in C_p} 
\sum\limits_{ \small S_j \in C_q} 
\dfrac {sim(S_i, S_j)} {|C_p| \cdot |C_q|}
\rightarrow min
$

In [75]:
def separation(chromosome, sentences, sim):
    """Inter-cluster similarity (objective F2) of a partition.

    Args:
        chromosome: cluster label for every sentence position.
        sentences: array indexable with a list of positions.
        sim: callable ``sim(sentence_a, sentence_b)`` returning a number.

    For every unordered pair of clusters, the similarity of each cross pair
    of members is divided by both cluster sizes; all terms are summed.
    """
    score = 0
    groups = clusterize(chromosome).values()
    for group_p, group_q in itertools.combinations(groups, 2):
        size_p, size_q = len(group_p), len(group_q)
        for first, second in itertools.product(group_p, group_q):
            pair = sentences[[first, second]]
            score += sim(pair[0], pair[1]) / size_p / size_q
    return score

In [76]:
# Separation (F2) of the toy partition under the Jaccard measure.
separation(chromosome, sentences, jaccard_similarity)

4.317544526551878

### Inter/Intra-Cluster Balance

$
\large
F = \big(1 + sigm(F_1) \big)^{F_2} \rightarrow max
$

In [77]:
def cohesion_separation(chromosome, sentences, sim):
    """Combined objective ``F = (1 + sigm(F1)) ** F2``.

    ``F1`` is the cohesion and ``F2`` the separation of the partition that
    ``chromosome`` describes, both computed with ``sim``.
    """
    base = 1 + sigmoid(cohesion(chromosome, sentences, sim))
    exponent = separation(chromosome, sentences, sim)
    return base ** exponent

In [79]:
# Combined objective F of the toy partition under the Jaccard measure.
cohesion_separation(chromosome, sentences, jaccard_similarity)

12.646468283276466

## Fitness

$
\large
fitness_1\big(X_a(t)\big) = F_1\big(X_a(t)\big)
$

$
\large
fitness_2\big(X_a(t)\big) = \dfrac{1}{F_2(X_a(t))}
$

$
\large
fitness\big(X_a(t)\big) = F\big(X_a(t)\big)
$

In [80]:
def fitness_1(chromosome, sentences, sim):
    """Fitness that rewards cohesion only: the F1 objective itself."""
    score = cohesion(chromosome, sentences, sim)
    return score

def fitness_2(chromosome, sentences, sim):
    """Fitness that rewards low separation: the reciprocal of F2.

    F2 is to be minimised, so a separation of exactly 0 (no similarity
    between any two clusters, or fewer than two clusters) is the best
    possible value and maps to ``math.inf`` instead of dividing by zero.
    """
    sep = separation(chromosome, sentences, sim)
    if sep == 0:
        return math.inf
    return 1 / sep

def fitness(chromosome, sentences, sim):
    """Fitness that balances both objectives: the combined F value."""
    combined = cohesion_separation(chromosome, sentences, sim)
    return combined

# Modified Discrete Differential Evolution Algorithm

Initialize the population with $N$ chromosomes each composed of $n$ random integers from \[1, k\]. __NOTE:__ $t$ is the iteration step.

$
X_r(t) = [x_{r,1}(t), x_{r,2}(t), ... x_{r,n}(t)]
$  

where:
- $ x_{r,s}(t) \in \{1, 2, ..., k\} $
- $ r = 1, 2, ..., N $
- $ s = 1, 2, ..., n $
- $N$ is the population size
- $n$ is the number of sentences (in the document)
- $k$ is the number of clusters (number of sentences for summary)

In [17]:
def assign_clusters(num_of_clusters, num_of_sentences):
    """Return a random chromosome in which every cluster label occurs.

    Args:
        num_of_clusters: number of labels ``k``; labels are ``0 .. k-1``.
        num_of_sentences: chromosome length ``n``.

    Returns:
        1-D integer array of length ``n`` containing each label at least once.

    Raises:
        ValueError: if ``k < 1`` or ``k > n`` (no such chromosome exists).

    The chromosome is built constructively -- one gene per label, the rest
    drawn uniformly, then shuffled -- so it always finishes after one draw.
    Re-sampling until all labels show up never ends for ``k > n`` and needs
    an enormous number of attempts when ``k`` is close to ``n``.
    """
    if num_of_clusters < 1:
        raise ValueError('num_of_clusters must be at least 1')
    if num_of_clusters > num_of_sentences:
        raise ValueError('num_of_clusters cannot be greater than num_of_sentences')

    guaranteed = np.arange(num_of_clusters)
    remainder = np.random.randint(num_of_clusters,
                                  size=num_of_sentences - num_of_clusters)
    chromosome = np.concatenate([guaranteed, remainder])
    np.random.shuffle(chromosome)  # in place; removes the positional bias
    return chromosome
    

def initialize_population(population_size, num_of_clusters, num_of_sentences):
    """Create the initial population of random chromosomes.

    Returns ``np.array`` built from ``population_size`` chromosomes, each
    produced by ``assign_clusters(num_of_clusters, num_of_sentences)``.

    Raises:
        ValueError: if there are more clusters than sentences.
    """
    if num_of_sentences < num_of_clusters:
        raise ValueError('num_of_clusters cannot be greater than num_of_sentences')
    chromosomes = []
    for _ in range(population_size):
        chromosomes.append(assign_clusters(num_of_clusters, num_of_sentences))
    return np.array(chromosomes)

$
\large
y_{r, s}(t+1) = 
\begin{cases}
    x_{r1,s}(t) + \lambda\big(x_{r2,s}(t) - x_{r3,s}(t)\big) & \text{if}\ rand_s < CR \\
    x_{r,s}(t) & \text{otherwise}
\end{cases}
$

where  
- For each $X_r$, randomly sample $X_{r1}(t), X_{r2}(t), X_{r3}(t)$ from the same generation (each distinct)
- $rand_s$ is a uniformly distributed random number from \[0, 1\], drawn once for each $s \in \{1, 2, ..., n\}$

hyper-parameters 
- $\lambda$ is a scale factor from \[0, 1\]
- $CR$ is the crossover rate from \[0, 1\]

In [20]:
###: __NOTES__
#: X - population
#: Y - offspring
#: M - mutation
#: X_r - chromosome
#: X_r_s - gene
#: (t) - current generation
#: (t+1) - next generation
#: f(*) - fitness function for determining whether X_r(t) or X_r(t+1) survives


#TODO: what if new chromosome does not include all clusters? do these just fall out from fitness func?
def generate_offspring(population, scaling_factor, crossover_rate, num_of_clusters=None):
    """Create one offspring chromosome per member of ``population``.

    For chromosome ``r`` three other, mutually distinct chromosomes
    ``r1, r2, r3`` are sampled.  Every gene ``s`` whose random number
    ``rand_s`` is below ``crossover_rate`` is replaced by
    ``x[r1, s] + scaling_factor * (x[r2, s] - x[r3, s])``, floored and
    wrapped into the valid label range; all other genes are kept.

    Args:
        population: 2-D integer array, shape (population_size, num_of_sentences),
            with labels ``0 .. k-1``.
        scaling_factor: the scale factor lambda.
        crossover_rate: the crossover rate CR.
        num_of_clusters: number of labels ``k``.  When omitted it is inferred
            as ``max(population) + 1``.

    Returns:
        A new array shaped like ``population``; the input is not modified.

    Raises:
        ValueError: if the population has fewer than 4 chromosomes.
    """
    population_size, num_of_sentences = population.shape
    if population_size < 4:
        raise ValueError('population must contain at least 4 chromosomes')
    if num_of_clusters is None:
        # Labels are zero-based, so the largest label is k - 1.  Using the
        # maximum itself as modulus would make label k - 1 unreachable.
        num_of_clusters = int(np.max(population)) + 1

    offspring = population.copy()
    all_idxs = np.arange(population_size)

    for i in range(population_size):
        #: must select chrom != chrom_l != chrom_m != chrom_n from population
        choices = all_idxs[all_idxs != i]
        idx_l, idx_m, idx_n = np.random.choice(choices, size=3, replace=False)
        chrom_l, chrom_m, chrom_n = population[idx_l], population[idx_m], population[idx_n]

        #TODO: pull out `rand` so it can be used in `mutate()`
        rand = np.random.random_sample(num_of_sentences)
        crossover = rand < crossover_rate

        # Floor before taking the modulus: the result is an exact integer in
        # [0, k), whereas a float modulus can round up to k itself.
        candidate = np.floor(chrom_l + scaling_factor * (chrom_m - chrom_n))
        new_genes = candidate.astype(offspring.dtype) % num_of_clusters
        offspring[i, crossover] = new_genes[crossover]

    return offspring

In [91]:
# Trial-run settings: N chromosomes, n sentences, k clusters,
# scale factor λ and crossover rate cr.
N = 200
n = len(sentences)
k = n // 3
λ = 0.6
cr = 0.3
# Objective with corpus and similarity measure bound; only the chromosome is left open.
criterion = functools.partial(fitness, sentences=sentences, sim=jaccard_similarity)

X = initialize_population(N, k, n)
Y = generate_offspring(X, scaling_factor=λ, crossover_rate=cr)

#: check genes in {0, 1, ..., k-1} -> % handles this
assert all(gene in range(k) for gene in Y.ravel())

#: check genes are integers -> np.array(dtype=int) handles this
assert all(gene == int(gene) for gene in Y.ravel())

$
\large
X_r(t+1) = 
\begin{cases}
    Y_r(t+1) & \text{if}\ f\big(Y_r(t+1)\big) > f\big(X_r(t)\big) \\
    X_{r}(t) & \text{otherwise}
\end{cases}
$

where 
- $f(\cdot)$ is the objective function to be maximized

In [92]:
def next_generation(population, offspring, fitness, scaling_factor, crossover_rate):
    """Select, position by position, the fitter of parent and offspring.

    Args:
        population: 2-D array of the current chromosomes ``X(t)``.
        offspring: 2-D array ``Y(t+1)`` aligned with ``population``.  Pass
            ``None`` to have it generated from ``population``.
        fitness: callable taking one chromosome and returning a number to
            maximise.
        scaling_factor, crossover_rate: only used when ``offspring`` is
            ``None`` and has to be generated.

    Returns:
        A new array ``X(t+1)``: row ``r`` is ``offspring[r]`` when it is
        strictly fitter than ``population[r]``, otherwise ``population[r]``.
        Neither input is modified.
    """
    if offspring is None:
        offspring = generate_offspring(population, scaling_factor, crossover_rate)

    next_gen = population.copy()
    for i, (chrom_pop, chrom_off) in enumerate(zip(population, offspring)):
        if fitness(chrom_off) > fitness(chrom_pop):
            # Write the winner into the result, not back into `offspring`.
            next_gen[i] = chrom_off
    return next_gen

In [93]:
# Run one generation step for population X with the trial-run settings.
next_generation(X, Y, criterion, scaling_factor=λ, crossover_rate=cr)

array([[1, 2, 0, ..., 2, 1, 1],
       [1, 1, 0, ..., 0, 0, 1],
       [1, 2, 2, ..., 2, 1, 0],
       ...,
       [1, 2, 2, ..., 1, 1, 0],
       [0, 1, 1, ..., 1, 1, 1],
       [2, 1, 1, ..., 2, 1, 1]])

## Mutation

At each iteration $t+1$, a mutation mask
$ m_r(t+1) = [m_{r,1}(t+1), m_{r,2}(t+1), ..., m_{r,n}(t+1)] $ is created for each $X_r(t)$.  
For each gene, 1 indicates no mutation and 0 means mutate.

$
\large
m_{r,s}(t+1) = 
\begin{cases}
    1 & \text{if}\ rand_s < sigm\big(y_{r,s}(t+1)\big) \\
    0 & \text{otherwise}
\end{cases}
$

In [106]:
def mutate(Y, rand):
    """Build the mutation mask for the offspring ``Y``.

    Args:
        Y: integer array of offspring genes.
        rand: uniform random numbers, one per gene (same number of
            elements as ``Y``).

    Returns:
        Array shaped and typed like ``Y`` holding 1 where
        ``rand < sigmoid(gene)`` (keep the gene) and 0 elsewhere (mutate it).
    """
    Y = np.asarray(Y)
    # Pair genes and random numbers element-wise, as the flat loop did.
    rand = np.asarray(rand).reshape(Y.shape)
    return (rand < sigmoid(Y)).astype(Y.dtype)

# `rand` only exists as a local inside generate_offspring, so draw one uniform
# random number per gene here before building the mutation mask.
rand = np.random.random_sample(Y.shape)
mutate(Y, rand)

## Inversion Operator

### Figure 1 - Pseudo-code
<img src="./data/pngs/fig1_-_inversion_operator_psuedo_code.png" alt="Fig 1. Inverse operator psuedo-code" width="33%" align="left"/>

In [43]:
def inversion_operator(X_r, m_r_p1):
    """Mirror the genes of ``X_r`` that are flagged for mutation.

    Positions where ``m_r_p1`` is falsy (0 = mutate) are collected; the
    lowest flagged position trades values with the highest, the second
    lowest with the second highest, and so on.  A lone middle position
    keeps its value.  ``X_r`` is left untouched; a modified copy is returned.
    """
    inverted = X_r.copy()
    flagged = sorted(pos for pos, keep in enumerate(m_r_p1) if not keep)

    low, high = 0, len(flagged) - 1
    while low <= high:
        left, right = flagged[low], flagged[high]
        inverted[left] = X_r[right]
        inverted[right] = X_r[left]
        low += 1
        high -= 1

    return inverted

### Figure 2 - Example
<img src="./data/pngs/fig2_-_inversion_operator_diagram.png" alt="Fig 2. Inverse operator example" width="33%" align="left"/>

In [44]:
# Worked example: the positions holding 0 in m_r_p1 (0, 3, 5, 6) are mirrored.
X_r = [3, 2, 4, 2, 3, 1, 4, 1]
m_r_p1 = [0, 1, 1, 0, 1, 0, 0, 1]
X_r_p1 = [4, 2, 4, 1, 3, 2, 3, 1]

assert inversion_operator(X_r, m_r_p1) == X_r_p1

## ROUGE-N
(Recall Oriented Understudy for Gisting Evaluation)

$
\large
\text{ ROUGE-N } = \dfrac
    { \sum\limits_{ \small S \in Summ_{ref}} 
        \sum\limits_{ \small \text{N-gram} \in S} 
            Count_{ \small match}(\text{N-gram}) }
    { \sum\limits_{ \small S \in Summ_{ref}} 
        \sum\limits_{ \small \text{N-gram} \in S} 
            Count( \text{N-gram} ) }
$

In [7]:
def rouge_n(n, y_pred, y_true):
    """ROUGE-N recall of a predicted summary against a reference.

    Args:
        n: n-gram length.
        y_pred: token sequence of the predicted summary.
        y_true: token sequence of the reference summary.

    Returns:
        Matched n-grams divided by the number of n-grams in the reference.
        N-grams are counted with multiplicity and a match is clipped to the
        smaller of the two counts, as in the Count / Count_match formula.
        0.0 is returned when the reference has no n-gram of length ``n``.
    """
    pred_counts = collections.Counter(ngrams(y_pred, n))
    true_counts = collections.Counter(ngrams(y_true, n))

    total = sum(true_counts.values())
    if total == 0:
        # Reference shorter than n tokens: nothing to recall.
        return 0.0

    # Counter & Counter keeps the minimum count per n-gram (clipped matches).
    matched = sum((pred_counts & true_counts).values())
    return matched / total

### testing rouge_n
https://rare-technologies.com/text-summarization-in-python-extractive-vs-abstractive-techniques-revisited/#how_to_evaluate_text

In [12]:
# Worked example: 7 of the 8 reference unigrams and 4 of its 7 bigrams are recalled.
y_true = 'a good diet must have apples and bananas'.split()
y_pred = 'apples and bananas are must for a good diet'.split()

assert rouge_n(1, y_pred, y_true) == 7 / 8
assert rouge_n(2, y_pred, y_true) == 4 / 7

Preprocess Steps:  
 - stop words
 - lemmatize