In [108]:
import collections
import itertools
import functools
import math
import operator
import string
import random

import numpy as np
import pandas as pd

from nltk import tokenize
from nltk.util import ngrams

from scipy.spatial.distance import jaccard
from scipy.special import expit as sigmoid

from sklearn.metrics import jaccard_similarity_score
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
import json
import pathlib

# Build the data paths relative to the current working directory.
cwd = pathlib.Path.cwd()
data = cwd / 'data'
jsons = data / 'jsons'
json_2018 = jsons / '2018'

# Rebind `json_2018` from the directory to the first entry that directory yields.
# NOTE(review): pathlib documents `iterdir()` as yielding children in arbitrary
# order, so which file ends up selected is not guaranteed — consider sorting.
json_2018 = list(json_2018.iterdir())[0]

In [93]:
# Parse the selected JSON file and keep the value stored under its '2018' key.
with open(json_2018) as fp:
    articles_2018 = json.load(fp)['2018']

# Work on a single article (index 4); its 'story' text is lower-cased so later
# word comparisons are case-insensitive.
article = articles_2018[4]
text = article['story'].lower()
# print(text)

# Evolutionary Algorithms
- [Discrete Differential Evolution for Text Summarization](https://www.researchgate.net/publication/281662415_Discrete_Differential_Evolution_for_Text_Summarization)
- [Evolutionary Algorithm for Extractive Text Summarization](https://www.researchgate.net/profile/Ramiz_Aliguliyev/publication/220518077_Evolutionary_Algorithm_for_Extractive_Text_Summarization/links/09e4151356fc2caab6000000.pdf)
- [An Improved Evolutionary Algorithm for Extractive Text Summarization](https://link.springer.com/chapter/10.1007/978-3-642-36543-0_9)

## Sentence Clustering

In [4]:
def distinct_words(text):
    """Return the distinct tokens of ``text`` as a frozenset.

    Every character found in ``string.punctuation`` is dropped first; the
    remaining text is split with ``tokenize.word_tokenize``.
    """
    punctuation = string.punctuation
    stripped = ''.join(filter(lambda char: char not in punctuation, text))
    tokens = tokenize.word_tokenize(stripped)
    return frozenset(tokens)

Let document $ D $ be decomposed into a set of $ n $ sentences.  
$
D = \{ S_1, S_2, ..., S_n \}
$

In [98]:
# Sentences of the article as a set: duplicate sentences collapse into one
# element and the original sentence order is not kept.
D = document_sentences = set(tokenize.sent_tokenize(text))

In [133]:
# Wrap the sentence-by-term matrix in a DataFrame with one column per vocabulary term.
# NOTE(review): `sents_words_arr` and `cv` are only assigned in a later cell
# (the CountVectorizer cell), so this cell fails on Restart & Run All.
# NOTE(review): `get_feature_names()` appears to have been replaced by
# `get_feature_names_out()` in newer scikit-learn releases — confirm against the
# installed version.
sents_words_df = pd.DataFrame(sents_words_arr, columns=cv.get_feature_names())

Let terms $ T $ be the set of all $ m $ distinct words in $ D $.  
$
T = \{ t_1, t_2, ..., t_m \}
$

In [6]:
# Distinct punctuation-stripped tokens of the whole article text.
T = document_distinct_words = distinct_words(text)

Let $ S_i $ represent the set of distinct terms in sentence $ S_i $ with 
$ m_i $ distinct terms.  
$
S_i = \{ t_1, t_2, ..., t_{ m_i } \}
$

In [7]:
# One frozenset of distinct words per sentence. Because the outer container is a
# set, sentences with identical word sets collapse into a single element.
S = sentence_distinct_words = {distinct_words(ds) for ds in document_sentences}

## Similarity Measures

### Jaccard Coefficient Similarity Measure

$
\large
sim_{ jaccard } ( S_i, S_j ) = 
\dfrac
    { | S_i \cap S_j | }
    { | S_i \cup S_j | }
$

### Normalized Google Metrics

$
\large
\text{NGD} ( t_k, t_l ) = \dfrac 
    { \text{max} \big\{
        \text{log} ( f_k ), 
        \text{log} ( f_l )
    \big\}
    - \text{log} ( f_{ kl } ) }
    { \text{log} ( n ) 
    - \text{min} \big\{
        \text{log} ( f_k ), 
        \text{log} ( f_l )
    \big\} }
$

where:
- $ t_k $ and $ t_l $ are terms 
- $ f_k $ is the number of sentences containing $ t_k $
- $ f_{ kl } $ is the number of sentences containing both $ t_k $ and $ t_l $
- $ n $ is the total number of sentences

$
\large 
sim_{ \text{NGD} } ( t_k, t_l ) = \text{exp} 
    \big( - \text{NGD} ( t_k, t_l ) \big)
$

$
\large 
sim_{ \text{NGD} } ( S_i, S_j ) = 
\dfrac
    { \sum\limits
        _{ \small t_k \in S_i } 
        \sum\limits
            _{ \small t_l \in S_j } 
            sim_{ \text{NGD} } ( t_k, t_l ) }
    { m_i m_j }
$  

where:
- $ S_i $ and $ S_j $ are sentences
- $ m_i $ is the number of distinct terms in $ S_i $

In [10]:
class NormalizedGoogle:
    """Normalized Google Distance (NGD) similarity within a single document.

    The document is split into sentences and each sentence is reduced to its
    set of distinct words; term frequencies are counted over those sentences.
    """

    def __init__(self, document):
        """Tokenize ``document`` into sentences and store each one's distinct words."""
        self.sentence_words = tuple(distinct_words(sent) for sent in tokenize.sent_tokenize(document))

    # double check scientific paper's handling of "bad" log values
    def distance(self, term_k, term_l):
        """Return the NGD between two terms.

        Raises:
            ValueError: if either term occurs in no sentence of the document.

        Special cases (both avoid an undefined logarithm or division):
        * the terms never co-occur -> 1.0 (``log(0)`` is undefined);
        * the rarer term occurs in every sentence -> 0.0, because then both
          terms co-occur everywhere and the formula degenerates to ``0 / 0``.
        """
        freq_k = sum(term_k in sent for sent in self.sentence_words)
        freq_l = sum(term_l in sent for sent in self.sentence_words)
        if not (freq_k and freq_l):
            raise ValueError('terms must be in document')

        freq_kl = sum((term_k in sent) and (term_l in sent) for sent in self.sentence_words)
        if freq_kl == 0:
            return 1.0

        n = len(self.sentence_words)
        if min(freq_k, freq_l) == n:
            # log(n) - log(min frequency) would be exactly zero here.
            return 0.0

        log_k, log_l = math.log(freq_k), math.log(freq_l)
        numerator = max(log_k, log_l) - math.log(freq_kl)
        denominator = math.log(n) - min(log_k, log_l)
        return numerator / denominator

    def term_similarity(self, term_k, term_l):
        """Return ``exp(-NGD(term_k, term_l))``."""
        return math.exp(-self.distance(term_k, term_l))

    def sentence_similarity(self, sent_i, sent_j):
        """Return the mean term similarity over all word pairs of two sentences.

        ``sent_i`` and ``sent_j`` are sized collections of terms; every term
        must occur in the document (see ``distance``).
        """
        total = sum(self.term_similarity(term_k, term_l)
                    for term_k, term_l in itertools.product(sent_i, sent_j))
        return total / len(sent_i) / len(sent_j)

## Objective Functions

Let $ C $ be a partition of $ D $ with $ k $ clusters.  
$ C = \{ C_1, C_2, ..., C_k \} $  

where:
- $ C_p \cap C_q = \emptyset, 
        \forall p \ne q \in \{ 1, 2, ..., k \} 
  $
- $ \bigcup\limits
    _{ p = 1 }
    ^k C_p = D 
  $
- $ C_p \ne \emptyset, 
        \forall p \in \{ 1, 2, ..., k \}
  $

In [12]:
def verify_clusters(clusters, document):
    """Check that ``clusters`` is a partition of ``document``.

    Args:
        clusters: iterable of sets (the candidate clusters).
        document: set of all elements that must be covered.

    Raises:
        ValueError: if any cluster is empty, two clusters overlap, or the
            union of the clusters differs from ``document``.
    """
    # Materialize once so one-shot iterables survive the three checks below.
    clusters = list(clusters)
    disjoint = all(not cluster_i & cluster_j
                   for cluster_i, cluster_j in itertools.combinations(clusters, r=2))
    # set().union(...) also copes with an empty cluster list, where
    # functools.reduce without an initial value would raise TypeError.
    union = set().union(*clusters) == document
    nonempty = all(clusters)
    if not (disjoint and union and nonempty):
        raise ValueError('clusters do not form a partition')

In [13]:
def clusterize(chromosome):
    """Group gene positions by cluster label.

    Returns a ``defaultdict(list)`` mapping each label found in ``chromosome``
    to the list of positions carrying that label, in ascending order.
    """
    members_by_label = collections.defaultdict(list)
    for position, label in enumerate(chromosome):
        members_by_label[label] += [position]
    return members_by_label

### Sigmoid Function
$
\large
sigm ( x ) = 
\dfrac
    { 1 }
    { 1 + \text{exp} ( -x ) }
$

In [15]:
from scipy.special import expit as sigmoid
# interesting: this is 3x faster than my implementation

### Intra-Cluster Similarity (Cohesion)

$
\large
F_1 = 
\sum\limits
    _{ \small p = 1 }
    ^{ \small k } 
\sum\limits
    _{ \small S_i, S_j \in C_p } 
\dfrac 
    { sim ( S_i, S_j ) } 
    { | C_p | } 
\rightarrow \text{max}
$

*(__Note:__ Evol Alg for Ext Txt Summ doesn't show division of $|C_p|$ but I believe this to be a typo since the paragraph detailing it says "the average sum". Additionally the Disc Diff Evol for Txt Summ shows it as such.)*

### Inter-Cluster Dissimilarity (Separation)

$
\large
F_2 = 
\sum\limits
    _{ \small p = 1 }
    ^{ \small k - 1 }
\sum\limits
    _{ \small q = p + 1 }
    ^{ \small k } 
\sum\limits
    _{ \small S_i \in C_p } 
\sum\limits
    _{ \small S_j \in C_q } 
\dfrac 
    { sim ( S_i, S_j ) } 
    { | C_p | \cdot | C_q | }
\rightarrow \text{min}
$

In [274]:
import numba
from sklearn.metrics import jaccard_similarity_score


# Vectorize the article sentence by sentence: one row per sentence, one column
# per vocabulary term found by CountVectorizer.
cv = CountVectorizer()
vec = cv.fit_transform(tokenize.sent_tokenize(text))
#: convert all nonzero counts to 1; each row then acts as a set-membership vector
sents_words_arr = vec.toarray().astype(bool).astype(int)


# NOTE(review): `jaccard_similarity_score` was deprecated in scikit-learn 0.21
# and later removed in favour of `jaccard_score`; its value for binary vectors
# reportedly did not match the set-based Jaccard coefficient — confirm before
# relying on this result.
jaccard_similarity_score(sents_words_arr[0], sents_words_arr[1])  #: works


# Example chromosome: entry i is the cluster label assigned to sentence row i.
chrom = np.array([0, 2, 1, 1, 3, 2, 4, 1])


def cohesion(chromosome, sim, sentences=None):
    """Intra-cluster similarity (F_1) of a clustering.

    For every cluster, sums ``sim`` over all unordered pairs of its sentence
    rows and divides each term by the cluster size.

    Args:
        chromosome: 1-D array; entry i is the cluster label of sentence row i.
        sim: callable ``sim(row_i, row_j)`` returning a similarity value.
        sentences: 2-D array with one row per sentence. When omitted, the
            module-level ``sents_words_arr`` is used (the original behavior).

    Returns:
        The accumulated cohesion; ``0`` when no cluster has two or more rows.
    """
    if sentences is None:
        sentences = sents_words_arr   #: fall back to the global matrix
    total = 0
    for label in np.unique(chromosome):
        cluster_p = sentences[np.where(chromosome == label)]
        size = len(cluster_p)
        for sent_i, sent_j in itertools.combinations(cluster_p, r=2):
            total += sim(sent_i, sent_j) / size
    return total


def separation(chromosome, sim, sentences=None):
    """Inter-cluster similarity (F_2) of a clustering.

    For every unordered pair of clusters, sums ``sim`` over all cross-cluster
    sentence pairs and divides each term by the product of the cluster sizes.

    Args:
        chromosome: 1-D array; entry i is the cluster label of sentence row i.
        sim: callable ``sim(row_i, row_j)`` returning a similarity value.
        sentences: 2-D array with one row per sentence. When omitted, the
            module-level ``sents_words_arr`` is used (the original behavior).

    Returns:
        The accumulated separation; ``0`` when there is a single cluster.
    """
    if sentences is None:
        sentences = sents_words_arr   #: fall back to the global matrix
    total = 0
    for p, q in itertools.combinations(np.unique(chromosome), r=2):
        cluster_p = sentences[np.where(chromosome == p)]
        cluster_q = sentences[np.where(chromosome == q)]
        scale = len(cluster_p) * len(cluster_q)
        for sent_i, sent_j in itertools.product(cluster_p, cluster_q):
            total += sim(sent_i, sent_j) / scale
    return total

def cohesion_separation(chromosome, sim):
    """Combined objective ``(1 + sigmoid(cohesion)) ** separation``."""
    intra = cohesion(chromosome, sim)
    inter = separation(chromosome, sim)
    base = 1 + sigmoid(intra)
    return base ** inter

In [328]:
# Example chromosome: entry i is the cluster label assigned to sentence row i.
chrom = np.array([0, 2, 1, 1, 3, 2, 4, 1])

@numba.jit(nopython=True)
def combinations(a):
    #: yields every unordered pair (a[i], a[j]) with i < j, in index order
    size = len(a)
    for first in range(size - 1):
        left = a[first]
        for second in range(first + 1, size):
            yield left, a[second]

@numba.jit(nopython=True)
def jaccard_sim(a, b):
    #: element-wise AND / OR counts give |intersection| and |union|
    #: assume union is non-empty since each sentence >= 1 word
    shared = np.sum(a & b)
    combined = np.sum(a | b)
    return shared / combined

#: numba-compiled variant of cohesion: for each cluster label, sums
#: sim(row_i, row_j) / cluster size over all unordered pairs of the cluster's rows.
#: NOTE(review): numba documents global arrays as compile-time constants, so a
#: later rebinding of sents_words_arr is presumably not seen here — confirm.
@numba.jit(nopython=True)
def cohesion2(chromosome, sim):
    total = 0
    for p in np.unique(chromosome):
        cluster_p = sents_words_arr[np.where(chromosome == p)]   #: using global sent_words_arr
        #: cannot do itertools.combinations with numba
        for sent_i, sent_j in combinations(cluster_p):
            total += sim(sent_i, sent_j) / len(cluster_p)
    return total

#: scratch generator that yields the single value 1; it is not called in this cell
@numba.jit(nopython=True)
def test():
    yield 1

# Evaluate the jitted cohesion on the example chromosome using the jitted
# Jaccard similarity; the value is shown as the cell output.
cohesion2(chrom, jaccard_sim)
# jaccard_sim(a[0], b[0])
# list(combinations('abcd')) == list(itertools.combinations('abcd', r=2))

0.1848953118802626

In [324]:
%timeit list(combinations('abcdefghijklmn'))

37.7 µs ± 316 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [325]:
%timeit list(itertools.combinations('abcdefghijklmn', r=2))

3.46 µs ± 13.7 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [309]:
%timeit jaccard_sim(a[0], b[0])

1.19 µs ± 4.18 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)


In [310]:
%timeit jaccard_similarity_score(a[0], b[0])

89.7 µs ± 630 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [277]:
from numba import jit
import numpy as np

x = np.arange(100).reshape(10, 10)

# @jit(nopython=True) # Set "nopython" mode for best performance, equivalent to @njit
@numba.njit()
def go_fast(a): # Function is compiled to machine code when called the first time
    """Return ``a`` plus the sum of ``tanh`` over its main diagonal.

    ``a`` is indexed as ``a[i, i]`` for ``i`` in ``range(a.shape[0])``, so it
    must be 2-D with at least as many columns as rows.
    """
    # The stray bare reference to the global sentence matrix was removed: it
    # had no effect on the result but tied compilation to an unrelated global.
    trace = 0
    for i in range(a.shape[0]):   # Numba likes loops
        trace += np.tanh(a[i, i]) # Numba likes NumPy functions
    return a + trace              # Numba likes NumPy broadcasting

print(go_fast(x))

None


### Inter/Intra-Cluster Balance

$
\large
F = \big( 1 + sigm ( F_1 ) \big)
    ^{ F_2 } \rightarrow \text{max}
$

In [20]:
def cohesion_separation(chromosome, sentences, sim):
    """Combined objective F = (1 + sigmoid(F_1)) ** F_2 for one chromosome."""
    intra = cohesion(chromosome, sentences, sim)
    inter = separation(chromosome, sentences, sim)
    base = 1 + sigmoid(intra)
    return base ** inter

## Fitness

$
\large
fitness_1 \big( X_a ( t ) \big) = F_1 \big( X_a ( t ) \big)
$

$
\large
fitness_2 \big( X_a ( t ) \big) = \dfrac{ 1 }{ F_2 ( X_a ( t ) ) }
$

$
\large
fitness \big( X_a ( t ) \big) = F \big( X_a ( t ) \big)
$

In [22]:
def fitness_1(chromosome, sentences, sim):
    """Fitness equal to the cohesion objective; forwards all arguments to ``cohesion``."""
    return cohesion(chromosome, sentences, sim)

def fitness_2(chromosome, sentences, sim):
    """Fitness equal to the reciprocal of the separation objective.

    Separation is to be minimized, so a separation of exactly zero (a single
    cluster, or clusters sharing nothing) is reported as ``math.inf`` instead
    of raising ``ZeroDivisionError``.
    """
    sep = separation(chromosome, sentences, sim)
    if sep == 0:
        return math.inf
    return 1 / sep

def fitness(chromosome, sentences, sim):
    """Fitness equal to the combined objective; forwards all arguments to ``cohesion_separation``."""
    return cohesion_separation(chromosome, sentences, sim)

# Modified Discrete Differential Evolution Algorithm

Initialize the population with $ N $ chromosomes each composed of $ n $ random integers from \[1, k\].  

$
X_r ( t ) = [ x_{ r, 1 } ( t ), x_{ r, 2 } ( t ), ..., x_{ r, n } ( t ) ]
$  

where:
- $ x_{ r, s } ( t ) \in \{ 1, 2, ..., k \} $
- $ r = 1, 2, ..., N $
- $ s = 1, 2, ..., n $
- $ N $ is the population size
- $ n $ is the number of sentences _(in the document)_
- $ k $ is the number of clusters _(number of sentences for summary)_
- $ t $ is the iteration step

$
\large
y_{ r, s } ( t + 1 ) = 
\begin{cases}
    x_{ r1, s } ( t ) + \lambda \big( x_{ r2, s } ( t ) - x_{ r3, s } ( t ) \big) 
        & \text{if } rand_s < \text{CR} \\
    x_{ r, s } ( t ) & \text{otherwise}
\end{cases}
$

where  
- For each $ X_r $, randomly sample $ X_{ r1 } ( t ), X_{ r2 } ( t ), X_{ r3 } ( t ) $ 
  from the same generation _(each distinct)_
- $ rand_s $ is a uniformly distributed random number from $ [ 0, 1 ] $, 
  drawn once for each $ s \in \{ 1, 2, ..., n \} $

hyper-parameters 
- $ \lambda $ is a scale factor from $ [ 0, 1 ] $
- $ \text{CR} $ is the crossover rate from $ [ 0, 1 ] $

In [90]:
# def jaccard_similarity(a, b):
#     intersection = np.intersect1d(a, b)
#     union = np.union1d(a, b)
#     try:
#         return len(intersection) / len(union)
#     except ZeroDivisionError:
#         return 1

#TODO: vectorize this entire cell if possible and remove np.vectorizer(func) from next_generation
def jaccard_similarity(a, b):
    """Jaccard coefficient of two iterables, compared as sets of their elements.

    Returns ``1.0`` when both inputs are empty.
    """
    left, right = set(a), set(b)
    union = left | right
    if not union:
        return 1.0
    return len(left & right) / len(union)


def clusterize(chromosome):
    """Group sentence indices by cluster label.

    Returns a ``defaultdict(list)`` mapping each label in ``chromosome`` to the
    ascending list of positions that carry it.
    """
    partition = collections.defaultdict(list)
    for i, cluster in enumerate(chromosome):
        partition[cluster].append(i)
    return partition


def cohesion(chromosome, sentences, sim):
    """Intra-cluster similarity (F_1).

    Sums ``sim(S_i, S_j) / |C_p|`` over every unordered sentence pair inside
    each cluster.

    Args:
        chromosome: sequence whose entry i is the cluster label of sentence i.
        sentences: any integer-indexable collection of sentences (list, tuple
            or NumPy array); element-wise indexing is used, so no NumPy fancy
            indexing is required.
        sim: callable ``sim(sent_i, sent_j)`` returning a similarity value.

    Returns:
        The accumulated value; ``0`` when no cluster has two or more sentences.
    """
    total = 0
    for cluster in clusterize(chromosome).values():
        size = len(cluster)
        for i, j in itertools.combinations(cluster, r=2):
            total += sim(sentences[i], sentences[j]) / size
    return total


def separation(chromosome, sentences, sim):
    """Inter-cluster similarity (F_2).

    Sums ``sim(S_i, S_j) / (|C_p| * |C_q|)`` over every cross-cluster sentence
    pair for every unordered pair of clusters. Arguments are the same as for
    ``cohesion``.

    Returns:
        The accumulated value; ``0`` when there is only one cluster.
    """
    total = 0
    clusters = clusterize(chromosome)
    for cluster_p, cluster_q in itertools.combinations(clusters.values(), r=2):
        size_p, size_q = len(cluster_p), len(cluster_q)
        for i, j in itertools.product(cluster_p, cluster_q):
            total += sim(sentences[i], sentences[j]) / size_p / size_q
    return total


def cohesion_separation(chromosome, sentences, sim):
    """Combined objective F = (1 + sigmoid(F_1)) ** F_2."""
    coh = cohesion(chromosome, sentences, sim)
    sep = separation(chromosome, sentences, sim)
    return pow(1 + sigmoid(coh), sep)

In [91]:
def sigmoid(x):
    """Logistic function ``1 / (1 + exp(-x))``, applied element-wise.

    The exponent is negated so the value rises from 0 to 1 as ``x`` grows,
    matching the sigm(x) formula given earlier in the notebook.
    """
    return 1 / (1 + np.exp(-x))


def get_chromosome(choices, length):
    """Draw a random chromosome of ``length`` genes using labels from ``choices``.

    Every label in ``choices`` is guaranteed to appear at least once; the
    remaining genes are sampled from ``choices`` with replacement. ``-1`` is
    used internally as the "unfilled" marker, so it must not be a label.
    """
    chrom = np.full(length, -1)
    #: ensure that each choice is accounted for at least once
    idxs = np.random.choice(np.arange(length), len(choices), replace=False)
    chrom[idxs] = np.random.permutation(choices)
    idxs = np.where(chrom == -1)[0]
    chrom[idxs] = np.random.choice(choices, len(idxs))
    return chrom


def init_population(pop_size, clust_amt, chrom_len):
    """Return a ``(pop_size, chrom_len)`` array of random chromosomes over labels ``0..clust_amt-1``."""
    clusts = np.arange(clust_amt)
    chroms = [get_chromosome(clusts, chrom_len) for _ in range(pop_size)]
    return np.vstack(chroms)


def get_offspring(pop, rnd, lam, cr):
    """Build one trial chromosome per parent by differential crossover.

    Gene ``s`` of trial ``r`` is ``x_r1 + lam * (x_r2 - x_r3)`` where
    ``rnd[r, s] < cr`` and the parent's gene otherwise, as in the crossover
    formula of the notebook.

    Args:
        pop: 2-D array, one chromosome per row.
        rnd: array of the same shape as ``pop`` with values in [0, 1).
        lam: scale factor.
        cr: crossover rate.

    Returns:
        A new array shaped like ``pop``; ``pop`` is not modified.
    """
    #: For computation time, relax requirement that X_r, X_r1, X_r2, X_r3 are distinct.
    #: With large population size, this is unlikely to occur, and if it does, it doesn't
    #: seem that detrimental. Also is this mitigated with appropriate lam choice?
    n = len(pop)
    idxs = np.random.choice(np.arange(n), size=(n, 3))
    #: column-wise lookup keeps the (n, chrom_len) shape even when n == 1 or
    #: chrom_len == 1, which an axis-less np.squeeze would collapse
    chrom_1, chrom_2, chrom_3 = pop[idxs[:, 0]], pop[idxs[:, 1]], pop[idxs[:, 2]]
    offspr = chrom_1 + lam * (chrom_2 - chrom_3)
    #: NOTE(review): with a fractional lam the trial genes are not integers and
    #: are not wrapped into the valid label range — confirm how the paper
    #: discretizes them.
    keep_parent = rnd >= cr
    offspr[keep_parent] = pop[keep_parent]
    return offspr


def next_generation(pop, offspr, func):
    """Replace, in place, every parent whose trial chromosome scores strictly higher.

    ``func`` maps one chromosome to a scalar fitness. Returns ``None``.
    """
    fit_off = np.array([func(chrom) for chrom in offspr])
    fit_pop = np.array([func(chrom) for chrom in pop])
    mask = fit_off > fit_pop
    pop[mask] = offspr[mask]


def mutate(pop, rnd):
    """Apply the inversion operator to ``pop`` in place.

    A gene keeps its place when ``rnd < sigmoid(gene)`` (mask value 1) and is
    selected for mutation otherwise (mask value 0). Within each row the values
    at the selected positions are written back in reversed order. Returns
    ``None``.
    """
    rows, cols = np.nonzero(rnd >= sigmoid(pop))
    #: order the selected cells by row, then by descending column
    order = np.lexsort((-cols, rows))
    pop[rows, cols] = pop[rows[order], cols[order]]


def run_iterations(pop_size, summ_len, num_sents, func, lam, cr, iterations):
    """Evolve a random population for ``iterations`` steps and return it.

    Each step prints its index, draws one random matrix shaped like the
    population, builds offspring with it, keeps the fitter of parent and
    offspring, and finally mutates the population using the same matrix.
    """
    population = init_population(pop_size, summ_len, num_sents)
    for step in range(iterations):
        print(step)
        draws = np.random.random_sample(population.shape)
        candidates = get_offspring(population, draws, lam, cr)
        next_generation(population, candidates, func)
        mutate(population, draws)
    return population

In [92]:
# Hyper-parameters of the differential evolution run.
scaling_factor = 0.9
crossover_rate = 0.5

# `document_sentences` is a set, so fix an explicit order: gene i of every
# chromosome must always refer to the same sentence.
ordered_sentences = sorted(document_sentences)
# Compare sentences by their distinct words. Passing the raw strings would make
# `jaccard_similarity` build sets of characters instead of sets of terms.
sentence_terms = np.array([distinct_words(sent) for sent in ordered_sentences], dtype=object)
criterion = functools.partial(cohesion_separation, sentences=sentence_terms, sim=jaccard_similarity)


run_iterations(pop_size=100, summ_len=5, num_sents=len(ordered_sentences),
               func=criterion, lam=scaling_factor, cr=crossover_rate, iterations=1000)

# population = init_population(pop_size=100, summ_len=5, num_sents=len(document_sentences))
# evolved = evolve_generations(population, criterion, scaling_factor, crossover_rate, iterations=1000)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56


KeyboardInterrupt: 

$
\large
X_r(t+1) = 
\begin{cases}
    Y_r ( t + 1 ) & \text{if } f \big( Y_r ( t + 1 ) \big) > f \big( X_r ( t ) \big) \\
    X_r ( t ) & \text{otherwise}
\end{cases}
$

where 
- $ f ( \cdot ) $ is the objective function to be maximized

## Mutation

At each iteration $ t + 1 $, a mask vector
$ m_r ( t + 1 ) = [ m_{ r, 1 } ( t + 1 ), m_{ r, 2 } ( t + 1 ), ..., m_{ r, n } ( t + 1 ) ] $
is created for each $ X_r ( t ) $.  
For each gene, 1 indicates no mutation and 0 means mutate.

$
\large
m_{ r, s } ( t + 1 ) = 
\begin{cases}
    1 & \text{if } rand_s < sigm \big( y_{ r, s } ( t + 1 ) \big) \\
    0 & \text{otherwise}
\end{cases}
$

## Inversion Operator

### Figure 1 - Pseudo-code
<img src="./data/pngs/fig1_-_inversion_operator_psuedo_code.png" alt="Fig 1. Inversion operator pseudo-code" width="33%" align="left"/>

### Figure 2 - Example
<img src="./data/pngs/fig2_-_inversion_operator_diagram.png" alt="Fig 2. Inverse operator example" width="33%" align="left"/>

## Run Iterations

## ROUGE-N
_(Recall Oriented Understudy for Gisting Evaluation)_

$
\large
\text{ROUGE-N} = 
\dfrac
    { \sum \limits
        _{ \small S \in Summ_{ ref }} 
        \sum \limits
        _{ \small \text{N-gram} \in S } 
            Count_{ \small match } ( \text{N-gram} ) }
    { \sum \limits
        _{ \small S \in Summ_{ ref } } 
        \sum \limits
            _{ \small \text{N-gram} \in S } 
            Count( \text{N-gram} ) }
$

In [31]:
def rouge_n(n, y_pred, y_true):
    """ROUGE-N recall of a predicted token sequence against a reference.

    Each reference n-gram is matched at most as many times as it occurs in the
    prediction (clipped counts), and the total is divided by the number of
    n-grams in the reference, following the ROUGE-N formula.

    Args:
        n: n-gram size.
        y_pred: token sequence of the candidate summary.
        y_true: token sequence of the reference summary.

    Returns:
        A value in [0, 1]; ``0.0`` when the reference yields no n-grams
        (for example when it has fewer than ``n`` tokens).
    """
    n_gram_pred = collections.Counter(ngrams(y_pred, n))
    n_gram_true = collections.Counter(ngrams(y_true, n))
    total = sum(n_gram_true.values())
    if total == 0:
        return 0.0
    # Counter intersection keeps the minimum count of each shared n-gram.
    matched = sum((n_gram_pred & n_gram_true).values())
    return matched / total