In [1]:
import collections
import functools
import itertools
import multiprocessing

import numba
import numpy as np

from nltk import tokenize
from nltk.util import ngrams

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_distances

# Evolutionary Algorithms
- [Discrete Differential Evolution for Text Summarization](https://www.researchgate.net/publication/281662415_Discrete_Differential_Evolution_for_Text_Summarization)
- [Evolutionary Algorithm for Extractive Text Summarization](https://www.researchgate.net/profile/Ramiz_Aliguliyev/publication/220518077_Evolutionary_Algorithm_for_Extractive_Text_Summarization/links/09e4151356fc2caab6000000.pdf)
- [An Improved Evolutionary Algorithm for Extractive Text Summarization](https://link.springer.com/chapter/10.1007/978-3-642-36543-0_9)

# Similarity Measures

### Jaccard Coefficient Similarity Measure

$
\large
sim_{ jaccard } ( S_i, S_j ) = 
\dfrac
    { | S_i \cap S_j | }
    { | S_i \cup S_j | }
$

In [2]:
@numba.njit
def jaccard_similarity(a, b):
    """Return the Jaccard coefficient of two sentence vectors.

    The coefficient is ``sum(a & b) / sum(a | b)``, i.e. the size of the
    element-wise intersection divided by the size of the element-wise union.

    Parameters
    ----------
    a, b : array-like supporting ``&`` and ``|``
        Presumably boolean / binary term-occurrence vectors of equal length
        (TODO confirm against the caller).

    Returns
    -------
    The ratio of shared to combined entries.
    """
    shared = np.sum(a & b)
    combined = np.sum(a | b)
    #: the union is assumed to be non-empty since each sentence has >= 1 word
    return shared / combined

# Objective Functions

Let $ C $ be a partition of $ D $ with $ k $ clusters.  
$ C = \{ C_1, C_2, ..., C_k \} $  

where:
- $ C_p \cap C_q = \emptyset, 
        \forall p \ne q \in \{ 1, 2, ..., k \} 
  $
- $ \bigcup\limits
    _{ p = 1 }
    ^k C_p = D 
  $
- $ C_p \ne \emptyset, 
        \forall p \in \{ 1, 2, ..., k \}
  $

### Sigmoid Function
$
\large
sigm ( x ) = 
\dfrac
    { 1 }
    { 1 + \text{exp} ( -x ) }
$

In [3]:
@numba.njit
def sigmoid(x):
    """Return the logistic sigmoid ``1 / (1 + exp(-x))`` of ``x``.

    ``x`` may be anything ``np.exp`` accepts; the result has the same shape.
    """
    denominator = 1 + np.exp(-x)
    return 1 / denominator

### Intra-Cluster Similarity (Cohesion)

$
\large
F_1 = 
\sum\limits
    _{ \small p = 1 }
    ^{ \small k } 
\sum\limits
    _{ \small S_i, S_j \in C_p } 
\dfrac 
    { sim ( S_i, S_j ) } 
    { | C_p | } 
\rightarrow \text{max}
$

In [4]:
@numba.njit
def cohesion(chromosome, similarity, document):
    """Return the intra-cluster similarity (F1) of a clustering.

    Parameters
    ----------
    chromosome : array
        Cluster label of every sentence; position ``s`` labels ``document[s]``.
    similarity : callable
        Called as ``similarity(sentence_i, sentence_j)`` and expected to
        return a number.
    document : array
        Sentence representations, indexed by sentence position.

    Returns
    -------
    The sum, over every cluster and every unordered pair of distinct
    sentences inside it, of the pair's similarity divided by the cluster size.
    """
    score = 0
    labels = np.unique(chromosome)
    for idx in range(len(labels)):
        members = document[np.where(chromosome == labels[idx])]
        size = len(members)
        #: every unordered pair (left < right) of sentences in this cluster
        for left in range(size - 1):
            for right in range(left + 1, size):
                score += similarity(members[left], members[right]) / size
    return score

### Inter-Cluster Dissimilarity (Separation)

$
\large
F_2 = 
\sum\limits
    _{ \small p = 1 }
    ^{ \small k - 1 }
\sum\limits
    _{ \small q = p + 1 }
    ^{ \small k } 
\sum\limits
    _{ \small S_i \in C_p } 
\sum\limits
    _{ \small S_j \in C_q } 
\dfrac 
    { sim ( S_i, S_j ) } 
    { | C_p | \cdot | C_q | }
\rightarrow \text{min}
$

In [5]:
@numba.njit
def separation(chromosome, similarity, document):
    """Return the inter-cluster similarity (F2) of a clustering.

    Parameters
    ----------
    chromosome : array
        Cluster label of every sentence; position ``s`` labels ``document[s]``.
    similarity : callable
        Called as ``similarity(sentence_i, sentence_j)`` and expected to
        return a number.
    document : array
        Sentence representations, indexed by sentence position.

    Returns
    -------
    The sum, over every unordered pair of distinct clusters, of the mean
    pairwise similarity between the sentences of the two clusters.

    Notes
    -----
    Clusters are taken from the labels actually present in ``chromosome``
    (``np.unique``), exactly as ``cohesion`` does.  Iterating ``range(k)``
    instead would silently skip cluster pairs whenever the labels are not
    precisely ``0 .. k-1`` (e.g. after a label vanished from a chromosome).
    """
    total = 0
    labels = np.unique(chromosome)
    k = len(labels)
    #: combinations(labels, r=2)
    for p in range(k - 1):
        #: the members of cluster p do not depend on q, so select them once
        sents_p = document[np.where(chromosome == labels[p])]
        m = len(sents_p)
        for q in range(p + 1, k):
            sents_q = document[np.where(chromosome == labels[q])]
            n = len(sents_q)
            #: product(sents_p, sents_q)
            for i in range(m):
                for j in range(n):
                    total += similarity(sents_p[i], sents_q[j]) / m / n
    return total

### Inter/Intra-Cluster Balance

$
\large
F = \big( 1 + sigm ( F_1 ) \big)
    ^{ F_2 } \rightarrow \text{max}
$

In [6]:
@numba.njit
def cohesion_separation(chromosome, similarity, document):
    """Return the combined objective ``(1 + sigmoid(F1)) ** F2``.

    ``F1`` is ``cohesion(...)`` and ``F2`` is ``separation(...)``, both
    evaluated with the same ``chromosome``, ``similarity`` and ``document``.
    """
    intra = cohesion(chromosome, similarity, document)
    inter = separation(chromosome, similarity, document)
    base = 1 + sigmoid(intra)
    return base ** inter

## Fitness

$
\large
fitness_1 \big( X_a ( t ) \big) = F_1 \big( X_a ( t ) \big)
$

$
\large
fitness_2 \big( X_a ( t ) \big) = \dfrac{ 1 }{ F_2 ( X_a ( t ) ) }
$

$
\large
fitness \big( X_a ( t ) \big) = F \big( X_a ( t ) \big)
$

# Modified Discrete Differential Evolution Algorithm

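Initialize the population with $ N $ chromosomes each composed of $ n $ random integers from \[1, k\] _(the implementation below uses the zero-based labels $ 0, 1, ..., k - 1 $ and guarantees that every label occurs at least once in each chromosome)_.  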

$
X_r ( t ) = [ x_{ r, 1 } ( t ), x_{ r, 2 } ( t ), ..., x_{ r, n } ( t ) ]
$  

where:
- $ x_{ r, s } ( t ) \in \{ 1, 2, ..., k \} $
- $ r = 1, 2, ..., N $
- $ s = 1, 2, ..., n $
- $ N $ is the population size
- $ n $ is the number of sentences _(in the document)_
- $ k $ is the number of clusters _(number of sentences for summary)_
- $ t $ is the iteration step

In [7]:
def init_chromosome(choices, length):
    """Build a random chromosome of ``length`` genes drawn from ``choices``.

    Every value of ``choices`` is placed at least once; the remaining genes
    are sampled from ``choices`` with replacement.

    Parameters
    ----------
    choices : 1-D array-like
        Allowed gene values.  ``-1`` is used internally as the "unset"
        marker, so it should not be one of them.
    length : int
        Number of genes.  ``np.random.choice`` raises ``ValueError`` when it
        is smaller than ``len(choices)``.

    Returns
    -------
    numpy.ndarray of shape ``(length,)``.
    """
    unset = -1
    genes = np.full(length, unset)
    #: reserve one distinct position per choice so each one is represented
    reserved = np.random.choice(np.arange(length), len(choices), replace=False)
    genes[reserved] = np.random.permutation(choices)
    #: fill whatever is still unset with choices sampled with replacement
    (open_slots,) = np.where(genes == unset)
    genes[open_slots] = np.random.choice(choices, len(open_slots))
    return genes


def init_population(population_size, cluster_amount, chromosome_length):
    """Create the initial population as a 2-D array, one chromosome per row.

    Parameters
    ----------
    population_size : int
        Number of chromosomes (rows).
    cluster_amount : int
        Number of clusters; genes take the values ``0 .. cluster_amount - 1``.
    chromosome_length : int
        Number of genes per chromosome (columns).

    Returns
    -------
    numpy.ndarray of shape ``(population_size, chromosome_length)``.
    """
    labels = np.arange(cluster_amount)
    rows = []
    for _ in range(population_size):
        rows.append(init_chromosome(labels, chromosome_length))
    return np.vstack(rows)

$
\large
y_{ r, s } ( t + 1 ) = 
\begin{cases}
    x_{ r1, s } ( t ) + \lambda \big( x_{ r2, s } ( t ) - x_{ r3, s } ( t ) \big) 
        & \text{if } rand_s < \text{CR} \\
    x_{ r, s } ( t ) & \text{otherwise}
\end{cases}
$

where  
- For each $ X_r $, randomly sample $ X_{ r1 } ( t ), X_{ r2 } ( t ), X_{ r3 } ( t ) $ 
  from the same generation _(each distinct)_
- $ rand_s $ is a uniformly distributed random number from $ [ 0, 1 ] $, 
  drawn once for each $ s \in \{ 1, 2, ..., n \} $

hyper-parameters 
- $ \lambda $ is a scale factor from $ [ 0, 1 ] $
- $ \text{CR} $ is the crossover rate from $ [ 0, 1 ] $

In [8]:
def get_offspring(population, randoms, lambda_, crossover_rate):
    """Create one offspring per chromosome by differential mutation + crossover.

    For every parent row three distinct donor rows (none of them the parent
    itself) are drawn and combined gene-wise as
    ``(x_r1 + lambda_ * (x_r2 - x_r3)) % k``, where ``k`` is the number of
    distinct gene values in ``population``.  A gene of the offspring is taken
    from that mutant where ``randoms < crossover_rate`` and from the parent
    otherwise.

    Parameters
    ----------
    population : numpy.ndarray, shape (N, n)
        Current generation, one chromosome per row.  Needs ``N >= 4`` so that
        three donors other than the parent exist (``np.random.choice`` raises
        ``ValueError`` otherwise).
    randoms : numpy.ndarray, shape (N, n)
        Random numbers compared against ``crossover_rate``.
    lambda_ : number
        Scale factor of the difference vector.
    crossover_rate : number
        Threshold below which a gene is taken from the mutant.

    Returns
    -------
    numpy.ndarray with the shape and dtype of ``population`` (a new array;
    ``population`` is not modified).
    """
    members = np.arange(len(population))
    #: for every parent i pick three distinct donors, none of them i
    donors = np.array([
        np.random.choice(members[members != i], size=3, replace=False)
        for i in members
    ])
    #: index the donor columns directly; np.squeeze would also drop the gene
    #: axis when chromosomes have a single gene
    chrom_1 = population[donors[:, 0]]
    chrom_2 = population[donors[:, 1]]
    chrom_3 = population[donors[:, 2]]
    k = len(np.unique(population))
    mutant = (chrom_1 + lambda_ * (chrom_2 - chrom_3)) % k
    #: keep genes discrete: a fractional lambda_ yields fractional labels that
    #: the fitness would score but that are truncated when copied into the
    #: integer population.  Values are >= 0 here, so the cast floors them; the
    #: final modulo wraps a value that float rounding pushed up to k.
    mutant = mutant.astype(population.dtype) % k
    #: crossover: mutant gene where rand < CR, parent gene otherwise
    crossover = randoms < crossover_rate
    offspr = population.copy()
    offspr[crossover] = mutant[crossover]
    return offspr

$
\large
X_r(t+1) = 
\begin{cases}
    Y_r ( t + 1 ) & \text{if } f \big( Y_r ( t + 1 ) \big) > f \big( X_r ( t ) \big) \\
    X_r ( t ) & \text{otherwise}
\end{cases}
$

where 
- $ f ( \cdot ) $ is the objective function to be maximized

In [9]:
def next_generation(population, offspring, fitness, pool):
    """Replace, in place, every chromosome that its offspring strictly beats.

    Parameters
    ----------
    population : numpy.ndarray
        Current generation; modified in place.
    offspring : numpy.ndarray
        Candidate replacements, row ``r`` competing with ``population[r]``.
    fitness : callable
        Objective to maximise; called with a single chromosome.
    pool : object with a ``map(func, iterable)`` method
        Used to evaluate ``fitness`` over all parents followed by all
        offspring; the result must support slicing.

    Returns
    -------
    None
    """
    parent_count = len(population)
    candidates = itertools.chain(population, offspring)
    scores = pool.map(fitness, candidates)
    #: the first ``parent_count`` scores belong to the parents, the rest to
    #: their offspring in the same row order
    parent_scores = np.array(scores[:parent_count])
    child_scores = np.array(scores[parent_count:])
    improved = child_scores > parent_scores
    population[improved] = offspring[improved]

## Mutation

At each iteration $ t + 1 $, a mutation vector
$ m_r ( t + 1 ) = [ m_{ r, 1 } ( t + 1 ), m_{ r, 2 } ( t + 1 ), ..., m_{ r, n } ( t + 1 ) ] $
is created for each $ X_r ( t ) $.  
For each gene, 1 indicates no mutation and 0 means mutate.

$
\large
m_{ r, s } ( t + 1 ) = 
\begin{cases}
    1 & \text{if } rand_s < sigm \big( y_{ r, s } ( t + 1 ) \big) \\
    0 & \text{otherwise}
\end{cases}
$

### Inversion Operator

### Figure 1 - Pseudo-code
<img src="./data/pngs/fig1_-_inversion_operator_psuedo_code.png" alt="Fig 1. Inversion operator pseudo-code" width="33%" align="left"/>

### Figure 2 - Example
<img src="./data/pngs/fig2_-_inversion_operator_diagram.png" alt="Fig 2. Inverse operator example" width="33%" align="left"/>

In [10]:
def mutate(population, randoms):
    """Apply the inversion operator to ``population`` in place.

    A gene is kept (``m == 1``) where ``randoms < sigmoid(population)`` and is
    selected for mutation (``m == 0``) otherwise.  Within every row the
    selected genes are written back in reversed order; kept genes do not move.

    The previous implementation reversed the genes with ``m == 1``, i.e. the
    ones that are meant to be copied unchanged.

    Parameters
    ----------
    population : numpy.ndarray, 2-D
        One chromosome per row; modified in place.
    randoms : numpy.ndarray
        Random numbers, same shape as ``population``.

    Returns
    -------
    None
    """
    keep = randoms < sigmoid(population)
    #: positions to mutate, listed row by row with ascending columns
    rows, cols = np.nonzero(~keep)
    #: inversion operator: same rows, but columns descending inside each row
    order = np.lexsort((-cols, rows))
    #: the right-hand side is gathered before assignment, so this reverses the
    #: selected genes of every row
    population[rows, cols] = population[rows[order], cols[order]]
    return

# Extract Pertinent Sentences

In [11]:
def iterate_gerations(population_size, summary_length, sentence_count, fitness,
                      lambda_, crossover_rate, iterations):
    """Run the evolutionary loop, yielding the population after each generation.

    Each generation draws one array of random numbers, builds the offspring,
    keeps the fitter of parent and offspring, then mutates the result.

    Parameters
    ----------
    population_size : int
        Number of chromosomes.
    summary_length : int
        Number of clusters (passed to ``init_population`` as cluster amount).
    sentence_count : int
        Number of genes per chromosome.
    fitness : callable
        Objective to maximise; evaluated through a ``multiprocessing.Pool``.
    lambda_ : number
        Scale factor passed to ``get_offspring``.
    crossover_rate : number
        Crossover rate passed to ``get_offspring``.
    iterations : int
        Number of generations.

    Yields
    ------
    numpy.ndarray
        The population array.  The same object is updated in place and
        yielded every generation, so copy it to keep a snapshot.
    """
    pop = init_population(population_size, summary_length, sentence_count)
    shape = pop.shape
    #: the context manager terminates the pool even when the generator is
    #: closed early or an exception escapes the loop
    with multiprocessing.Pool() as pool:
        for _ in range(iterations):
            rand = np.random.random_sample(shape)
            offspr = get_offspring(pop, rand, lambda_, crossover_rate)
            next_generation(pop, offspr, fitness, pool)
            mutate(pop, rand)
            yield pop
    

def best_chromosome(population, fitness=None):
    """Return the chromosome of ``population`` with the highest fitness.

    Parameters
    ----------
    population : sequence of chromosomes
        Must support integer indexing (e.g. a 2-D ``numpy.ndarray``).
    fitness : callable, optional
        Objective to maximise; called with a single chromosome.  When omitted
        the module-level name ``fitness`` is used, which is what this function
        relied on implicitly before the parameter existed.

    Returns
    -------
    The first chromosome attaining the maximum fitness.

    Raises
    ------
    NameError
        If ``fitness`` is omitted and no module-level ``fitness`` exists.
    """
    if fitness is None:
        #: backward compatibility with callers that define a global `fitness`
        try:
            fitness = globals()["fitness"]
        except KeyError:
            raise NameError("name 'fitness' is not defined") from None
    best = np.argmax([fitness(chrom) for chrom in population])
    return population[best]
    

def central_sentences(chromosome, document, metric=cosine_distances):
    """Return, per cluster, the index of the sentence nearest the centroid.

    Parameters
    ----------
    chromosome : 1-D numpy.ndarray
        Cluster label of every sentence.
    document : array
        Sentence vectors, indexable by an array of sentence positions and
        providing ``mean(axis=0)`` (presumably a 2-D array with one row per
        sentence — TODO confirm).
    metric : callable
        Called as ``metric(sentences, centroid)`` with the centroid as a
        single row; the position of the smallest value selects the sentence.

    Returns
    -------
    list
        One sentence index per distinct cluster label, in ascending order.
    """
    picked = []
    for label in np.unique(chromosome):
        (members,) = np.where(chromosome == label)
        vectors = document[members]
        center = vectors.mean(axis=0)[np.newaxis, :]
        nearest = np.argmin(metric(vectors, center))
        #: map the within-cluster position back to a document index
        picked.append(members[nearest])
    picked.sort()
    return picked

# ROUGE-N
_(Recall Oriented Understudy for Gisting Evaluation)_

$
\large
\text{ROUGE-N} = 
\dfrac
    { \sum \limits
        _{ \small S \in Summ_{ ref }} 
        \sum \limits
        _{ \small \text{N-gram} \in S } 
            Count_{ \small match } ( \text{N-gram} ) }
    { \sum \limits
        _{ \small S \in Summ_{ ref } } 
        \sum \limits
            _{ \small \text{N-gram} \in S } 
            Count( \text{N-gram} ) }
$

In [12]:
def rouge_n(n, y_pred, y_true):
    """Return the ROUGE-N recall of ``y_pred`` against the reference ``y_true``.

    The score is the number of reference n-grams that also occur in the
    prediction divided by the total number of reference n-grams.  N-grams are
    counted with multiplicity and a match is clipped to the smaller of the two
    counts, as in the ``Count_match`` / ``Count`` formula; the earlier
    set-based version ignored repeated n-grams.

    Parameters
    ----------
    n : int
        N-gram order.
    y_pred : iterable of tokens
        Candidate summary.
    y_true : iterable of tokens
        Reference summary.

    Returns
    -------
    float
        Recall in ``[0, 1]``; ``0.0`` when the reference yields no n-grams
        (previously a ``ZeroDivisionError``).
    """
    pred_counts = collections.Counter(ngrams(y_pred, n))
    true_counts = collections.Counter(ngrams(y_true, n))
    total = sum(true_counts.values())
    if total == 0:
        return 0.0
    #: Counter intersection keeps min(count_pred, count_true) per n-gram
    matched = sum((pred_counts & true_counts).values())
    return matched / total