In [256]:
import collections
import itertools
import functools
import math
# import multiprocessing
import operator
import string
# import random

import numba
import numpy as np
# import pandas as pd

from nltk import tokenize
from nltk.util import ngrams

# from scipy.spatial.distance import jaccard
# from scipy.special import expit as sigmoid

# from sklearn.metrics import jaccard_similarity_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import euclidean_distances, cosine_distances

# Evolutionary Algorithms
- [Discrete Differential Evolution for Text Summarization](https://www.researchgate.net/publication/281662415_Discrete_Differential_Evolution_for_Text_Summarization)
- [Evolutionary Algorithm for Extractive Text Summarization](https://www.researchgate.net/profile/Ramiz_Aliguliyev/publication/220518077_Evolutionary_Algorithm_for_Extractive_Text_Summarization/links/09e4151356fc2caab6000000.pdf)
- [An Improved Evolutionary Algorithm for Extractive Text Summarization](https://link.springer.com/chapter/10.1007/978-3-642-36543-0_9)

# Similarity Measures

### Jaccard Coefficient Similarity Measure

$
\large
sim_{ jaccard } ( S_i, S_j ) = 
\dfrac
    { | S_i \cap S_j | }
    { | S_i \cup S_j | }
$

In [259]:
@numba.njit
def jaccard_similarity(a, b):
    #: assume union is non-empty since each sentence >= 1 word
    return np.sum(a & b) / np.sum(a | b)

# Objective Functions

Let $ C $ be a partition of $ D $ with $ k $ clusters.  
$ C = \{ C_1, C_2, ..., C_k \} $  

where:
- $ C_p \cap C_q = \emptyset, 
        \forall p \ne q \in \{ 1, 2, ..., k \} 
  $
- $ \bigcup\limits
    _{ p = 1 }
    ^k C_p = D 
  $
- $ C_p \ne \emptyset, 
        \forall p \in \{ 1, 2, ..., k \}
  $

### Sigmoid Function
$
\large
sigm ( x ) = 
\dfrac
    { 1 }
    { 1 + \text{exp} ( -x ) }
$

In [1]:
@numba.njit
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

NameError: name 'numba' is not defined

### Intra-Cluster Similiarity (Cohesion)

$
\large
F_1 = 
\sum\limits
    _{ \small p = 1 }
    ^{ \small k } 
\sum\limits
    _{ \small S_i, S_j \in C_p } 
\dfrac 
    { sim ( S_i, S_j ) } 
    { | C_p | } 
\rightarrow \text{max}
$

*(__Note:__ Evol Alg for Ext Txt Summ doesn't show division of $|C_p|$ but I believe this to be a typo since the paragraph detailing it says "the average sum". Additionally the Disc Diff Evol for Txt Summ shows it as such.)*

In [321]:
@numba.njit
def cohesion(chromosome, similarity, document):
    total = 0
    for p in np.unique(chromosome):
        sents = document[np.where(chromosome == p)]
        k = len(sents)
        #: combinations choose 2
        for i in range(k-1):
            for j in range(i+1, k):
                total += similarity(sents[i], sents[j]) / len(sents)  
    return total

### Inter-Cluster Dissimilarity (Separation)

$
\large
F_2 = 
\sum\limits
    _{ \small p = 1 }
    ^{ \small k - 1 }
\sum\limits
    _{ \small q = p + 1 }
    ^{ \small k } 
\sum\limits
    _{ \small S_i \in C_p } 
\sum\limits
    _{ \small S_j \in C_q } 
\dfrac 
    { sim ( S_i, S_j ) } 
    { | C_p | \cdot | C_q | }
\rightarrow \text{min}
$

In [322]:
@numba.njit
def separation(chromosome, similarity, document):
    total = 0
    k = len(np.unique(chromosome))
    #: combinations choose 2
    for p in range(k-1):
        for q in range(p+1, k):
            sents_p = document[np.where(chromosome == p)]
            sents_q = document[np.where(chromosome == q)]
            #: product
            m, n = len(sents_p), len(sents_q)
            for i in range(m):
                for j in range(n):
                    total += similarity(sents_p[i], sents_q[j]) / m / n
    return total

### Inter/Intra-Cluster Balance

$
\large
F = \big( 1 + sigm ( F_1 ) \big)
    ^{ F_2 } \rightarrow \text{max}
$

In [323]:
@numba.njit
def cohesion_separation(chromosome, similarity, document):
    coh = cohesion(chromosome, similarity, document)
    sep = separation(chromosome, similarity, document)
    return (1 + sigmoid(coh)) ** sep

## Fitness

$
\large
fitness_1 \big( X_a ( t ) \big) = F_1 \big( X_a ( t ) \big)
$

$
\large
fitness_2 \big( X_a ( t ) \big) = \dfrac{ 1 }{ F_2 ( X_a ( t ) ) }
$

$
\large
fitness \big( X_a ( t ) \big) = F \big( X_a ( t ) \big)
$

# Modified Discrete Differential Evolution Algorithm

Initialize the population with $ N $ chromosomes each composed of $ n $ random integers from \[1, k\].  

$
X_r ( t ) = [ x_{ r, 1 } ( t ), x_{ r, 2 } ( t ), ..., x_{ r, n } ( t ) ]
$  

where:
- $ x_{ r, s } ( t ) \in \{ 1, 2, ..., k \} $
- $ r = 1, 2, ..., N $
- $ s = 1, 2, ..., n $
- $ N $ is the population size
- $ n $ is the number of sentences _(in the document)_
- $ k $ is the number of clusters _(number of sentences for summary)_
- $ t $ is the iteration step

In [324]:
def init_chromosome(choices, length):
    chrom = np.full(length, -1)
    #: ensure that each choice is accounted for at least once
    idxs = np.random.choice(np.arange(length), len(choices), replace=False)
    chrom[idxs] = np.random.permutation(choices)
    idxs = np.where(chrom == -1)[0]
    chrom[idxs] = np.random.choice(choices, len(idxs))
    return chrom


def init_population(population_size, cluster_amount, chromosome_length):
    clusts = np.arange(cluster_amount)
    chroms = [init_chromosome(clusts, chromosome_length) for _ in range(population_size)]
    pop = np.vstack(chroms)
    return pop

$
\large
y_{ r, s } ( t + 1 ) = 
\begin{cases}
    x_{ r1, s } ( t ) + \lambda \big( x_{ r2, s } ( t ) - x_{ r3, s } ( t ) \big) 
        & \text{if } rand_s < \text{CR} \\
    x_{ r, s } ( t ) & \text{otherwise}
\end{cases}
$

where  
- For each $ X_r $, randomly sample $ X_{ r1 } ( t ), X_{ r2 } ( t ), X_{ r3 } ( t ) $ 
  from the same generation _(each distinct)_
- $ rand_s $ is uniformally distributed random numbers from $ [ 0, 1 ] $ 
  chosen once for each $ s \in \{ 1, 2, ..., n \} $

hyper-parameters 
- $ \lambda $ is a scale factor from $ [ 0, 1 ] $
- $ \text{CR} $ is the crossover rate from $ [ 0, 1 ] $

In [325]:
def get_offspring(population, randoms, lambda_, crossover_rate):
    #: For computation time, relax requirement that X_r, X_r1, X_r2, X_r3 are distinct. 
    #: With large population size, this is unlikely to occur, and if it does, it doesn't
    #: seem that detrimental. Also is this mitigated with appropriate lam choice?
    n = len(population)
    idxs = np.random.choice(np.arange(n), size=(n, 3))
    chrom_1, chrom_2, chrom_3 = map(np.squeeze, np.split(population[idxs], 3, axis=1))
    k = len(np.unique(population))
    offspr = (chrom_1 + lambda_ * (chrom_2 - chrom_3)) % k
    mask = randoms < crossover_rate
    offspr[mask] = population[mask]
    return offspr

$
\large
X_r(t+1) = 
\begin{cases}
    Y_r ( t + 1 ) & \text{if } f \big( Y_r ( t + 1 ) \big) > f \big( X_r ( t ) \big) \\
    X_r ( t ) & \text{otherwise}
\end{cases}
$

where 
- $ f ( \cdot ) $ is the objective function to be maximized

In [438]:
def next_generation(population, offspring, func):
    fit_off = np.array([func(chrom) for chrom in offspring])
    fit_pop = np.array([func(chrom) for chrom in population])
    mask = fit_off > fit_pop
    population[mask] = offspring[mask]
    return

## Mutation

At each iteration $ t + 1 $ for each $ X_r ( t ) $ creates
$ m_r ( t + 1 ) = [ m_{ r, 1 } ( t ), m_{ r, 2 } ( t ), ..., m_{ r, n } ( t ) ] $.  
For each gene, 1 indicates no mutation and 0 means mutate.

$
\large
m_{ r, s } ( t + 1 ) = 
\begin{cases}
    1 & \text{if } rand_s < sigm \big( y_{ r, s } ( t + 1 ) \big) \\
    0 & \text{otherwise}
\end{cases}
$

### Inversion Operator

### Figure 1 - Psuedo-code
<img src="./data/pngs/fig1_-_inversion_operator_psuedo_code.png" alt="Fig 1. Inverse operator psuedo-code" width="33%" align="left"/>

### Figure 2 - Example
<img src="./data/pngs/fig2_-_inversion_operator_diagram.png" alt="Fig 2. Inverse operator example" width="33%" align="left"/>

In [329]:
def mutate(population, randoms):
    mask = randoms < sigmoid(population)
    #: inversion operator
    idxs = np.nonzero(mask)
    arr = np.array(idxs)
    sorter = np.lexsort((-arr[1], arr[0]))
    rev = arr.T[sorter].T
    population[idxs] = population[(rev[0], rev[1])]
    return

# Extract Pertinent Sentences

In [326]:
#TODO: early stopping --> little fitness improvement over x generations, good enough fitness score
def run_iterations(pop_size, summ_len, num_sents, func, lam, cr, iterations, *, mutate_after=True,
                   seed=None, verbose=False, save_rate=np.nan, save_dir=None):
    
    if save_dir is not None:
        save_dir = pathlib.Path(save_dir)
        if not save_dir.is_dir():
            msg = f'save_dir={save_dir} not a valid directory path'.format(save_dir=save_dir)
            raise NotADirectoryError(msg)
    
    if seed is not None:
        np.random.seed(seed)
    
    pop = init_population(pop_size, summ_len, num_sents)
    shape = pop.shape
    for i in range(iterations):
        if i % save_rate == 0:
            file = save_dir / 'generation_{i:0>pad}'.format(i=i, pad=len(str(iterations)))
            np.save(file, pop)
            
        if verbose:
            print(i)  #TODO: logfile --> iteration number, best fitness score, avg fitness score, hyper-params
        
        rand = np.random.random_sample(shape)
        
        t0 = time.time()
        offspr = get_offspring(pop, rand, lam, cr)
        t1 = time.time()
        PROFILER['offspring'] += t1 - t0
        
        #: option since papers unclear if mutate at offspring or survivors stage
        if not mutate_after:
            mutate(offspr, rand)
            
        t0 = time.time()
        next_generation(pop, offspr, func)
        t1 = time.time()
        PROFILER['generation'] += t1 - t0
        
        if mutate_after:
            t0 = time.time()
            mutate(pop, rand)
            t1 = time.time()
            PROFILER['mutate'] += t1 - t0
    
    return pop

In [374]:
def best_chromosome(population):
    #TODO: make sure it picks one with all k-clusters
    fits = np.argmax([fitness(chrom) for chrom in population])
    chrom = population[fits]
    return chrom
    

def central_sentences(chromosome, document, metric=cosine_distances):
    central_sents = []
    for cluster in np.unique(chromosome):
        idxs = np.where(chromosome == cluster)[0]
        sents = document[idxs]
        centroid = sents.mean(axis=0)[np.newaxis,:]
        dists = metric(sents, centroid)
        cent_sent = idxs[np.argmin(dists)]
        central_sents.append(cent_sent)
    return sorted(central_sents)

# ROUGE-N
_(Recall Oriented Understudy for Gisting Evaluation)_

$
\large
\text{ROUGE-N} = 
\dfrac
    { \sum \limits
        _{ \small S \in Summ_{ ref }} 
        \sum \limits
        _{ \small \text{N-gram} \in S } 
            Count_{ \small match } ( \text{N-gram} ) }
    { \sum \limits
        _{ \small S \in Summ_{ ref } } 
        \sum \limits
            _{ \small \text{N-gram} \in S } 
            Count( \text{N-gram} ) }
$

In [330]:
def rouge_n(n, y_pred, y_true):
    n_gram_pred = set(ngrams(y_pred, n))
    n_gram_true = set(ngrams(y_true, n))
    return len(n_gram_pred & n_gram_true) / len(n_gram_true)

# Run Algorithm

In [394]:
import json
import pathlib

cwd = pathlib.Path.cwd()
data = cwd / 'data'
jsons = data / 'jsons'
json_2018 = jsons / '2018' / '2018.json'

with open(json_2018) as fp:
    articles_2018 = json.load(fp)['2018']

article = articles_2018[67]
text = article['story']
summ_true = article['summary']
# print(text)

In [395]:
import time

PROFILER = collections.Counter()


def fitness(chromosome):
    return cohesion_separation(chromosome, jaccard_similarity, doc)

cv = CountVectorizer(stop_words='english')
sents_lower = tokenize.sent_tokenize(text.lower())
vec = cv.fit_transform(sents_lower)
doc = vec.toarray().astype(bool).astype(int)


t0 = time.time()
pop = run_iterations(pop_size=100, summ_len=5, num_sents=len(doc), 
                     func=fitness, lam=0.9, cr=0.5, iterations=1000, verbose=False, seed=0)
t1 = time.time()
print(t1-t0)


chrom_best = best_chromosome(pop)
idxs = central_sentences(chrom_best, doc)
#: 2018 article[999] -- gay skier winter olympics -> no lower() causes wierd sentence breaks that throw off summary
# sents_norm = tokenize.sent_tokenize(text)
summ_evol = '\n'.join(np.array(sents_lower)[idxs])

PROFILER

13.156807899475098


Counter({'offspring': 0.16056060791015625,
         'generation': 12.841875314712524,
         'mutate': 0.12311697006225586})

In [396]:
print(summ_evol)

updated at 8:55 p.m. et

police in sussex, england, say they have made two arrests in the disruption of flights because of drone sightings at busy gatwick, the u.k.'s second-largest airport.
airport officials had been confident enough that the skies and runways were safe to resume service after a drone sighting on friday morning.
airfield movements were suspended while we investigated this as safety remains our main priority," airport officials said on twitter, just an hour after announcing the friday morning shutdown.
that interruption was significantly shorter than one that began wednesday night, when reports of unmanned aircraft nearby triggered a day and a half of suspended flights.
drones were first spotted in the vicinity on wednesday night, idling planes on the tarmac as officials sought to ensure the airspace was safe for takeoff.


In [397]:
def scores(n, y_pred):
    for i in range(1, n+1):
        score = rouge_n(i, y_pred, summ_true)
        print(i, score)

In [398]:
print('evolutionary')
scores(3, summ_evol)

evolutionary
1 0.8333333333333334
2 0.8347107438016529
3 0.5764705882352941


In [401]:
import gensim

summ_gensim = gensim.summarization.summarize(text)
print('gensim')
scores(3, summ_gensim)

gensim
1 0.7333333333333333
2 0.6033057851239669
3 0.3058823529411765


In [402]:
print(summ_gensim)

Airport officials had been confident enough that the skies and runways were safe to resume service after a drone sighting on Friday morning.
Airfield movements were suspended while we investigated this as safety remains our main priority," airport officials said on Twitter, just an hour after announcing the Friday morning shutdown.


In [403]:
from sumy.parsers.plaintext import PlaintextParser
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.summarizers.text_rank import TextRankSummarizer
from sumy.nlp.tokenizers import Tokenizer


def format_sumy(sumy_summary):
    return ''.join(char for sent in sumy_summary for char in str(sent) if char in string.printable)

parser = PlaintextParser(text, Tokenizer('english'))

text_rank = TextRankSummarizer()
summ_text_rank = format_sumy(text_rank(parser.document, 5))
print('sumy - text rank')
scores(3, summ_text_rank)

print()

lex_rank = LexRankSummarizer()
summ_lex_rank = format_sumy(lex_rank(parser.document, 5))
print('sumy - lex rank')
scores(3, summ_lex_rank)

sumy - text rank
1 0.9666666666666667
2 0.859504132231405
3 0.5941176470588235

sumy - lex rank
1 0.8666666666666667
2 0.768595041322314
3 0.48823529411764705


In [404]:
print(summ_text_rank)

Police in Sussex, England, say they have made two arrests in the disruption of flights because of drone sightings at busy Gatwick, the U.K.'s second-largest airport.Police on Friday night said their investigations are ongoing and reiterated their call for members of the public to report anything they know about the drone operators.Airport officials had been confident enough that the skies and runways were safe to resume service after a drone sighting on Friday morning.At least one frustrated tweeter seemed to speak eloquently for the more than 100,000 people who have had their plans disrupted at the U.K.'s second-busiest airport: "Oh, come on!"Drones were first spotted in the vicinity on Wednesday night, idling planes on the tarmac as officials sought to ensure the airspace was safe for takeoff.


In [405]:
print(summ_lex_rank)

ETFlights had already resumed on Friday, after suspensions starting Wednesday night and a complete shutdown on Thursday night, leaving weary travelers longing for their holiday destinations.Airport officials had been confident enough that the skies and runways were safe to resume service after a drone sighting on Friday morning."Flights have now resumed.At least one frustrated tweeter seemed to speak eloquently for the more than 100,000 people who have had their plans disrupted at the U.K.'s second-busiest airport: "Oh, come on!"


In [406]:
print(text)

Updated at 8:55 p.m. ET

Police in Sussex, England, say they have made two arrests in the disruption of flights because of drone sightings at busy Gatwick, the U.K.'s second-largest airport.

Flights had already resumed on Friday, after suspensions starting Wednesday night and a complete shutdown on Thursday night, leaving weary travelers longing for their holiday destinations.

Police on Friday night said their investigations are ongoing and reiterated their call for members of the public to report anything they know about the drone operators.

Airport officials had been confident enough that the skies and runways were safe to resume service after a drone sighting on Friday morning. "Flights have now resumed. Airfield movements were suspended while we investigated this as safety remains our main priority," airport officials said on Twitter, just an hour after announcing the Friday morning shutdown.

That interruption was significantly shorter than one that began Wednesday night, when 