In [1]:
import collections
import itertools
import functools
import math
import operator
import string
import random

import numpy as np

from nltk import tokenize
from nltk.util import ngrams

from scipy.special import expit as sigmoid

In [2]:
import json
import pathlib

# Locate the directory holding the 2018 article dump, relative to the
# notebook's working directory.
cwd = pathlib.Path.cwd()
data = cwd / 'data'
jsons = data / 'jsons'
json_2018_dir = jsons / '2018'

# iterdir() yields entries in arbitrary order, so sort before taking the first
# one; otherwise a different file could be picked on another machine or run.
json_2018 = sorted(json_2018_dir.iterdir())[0]

In [3]:
# Read with an explicit encoding so the result does not depend on the
# platform's locale default.
with open(json_2018, encoding='utf-8') as fp:
    articles_2018 = json.load(fp)['2018']

# Work on a single article; its story is lower-cased so that later word
# comparisons are case-insensitive.
article = articles_2018[4]
text = article['story'].lower()
print(text)

people who don't get insurance through their jobs will now be able to buy short-term policies that may be cheaper than affordable care act coverage. these plans won't have to cover as many medical services and are exempt from covering people with pre-existing conditions.

the departments of health and human services, labor and treasury announced new rules wednesday that make it easier for consumers to replace aca insurance with these short-term policies.

the policies were originally limited to three months, but they can now last up to a year, and be renewed to last as long as three years. the plans have been a priority of president trump, who says he wants consumers to have access to cheaper health insurance.

short-term plans don't have to meet the affordable care act's consumer protection and coverage requirements, so many will not cover services such as mental health care or prescription drugs. and insurance companies can deny customers coverage on these plans if they have a pre-ex

# Evolutionary Algorithms
- [Discrete Differential Evolution for Text Summarization](https://www.researchgate.net/publication/281662415_Discrete_Differential_Evolution_for_Text_Summarization)
- [Evolutionary Algorithm for Extractive Text Summarization](https://www.researchgate.net/profile/Ramiz_Aliguliyev/publication/220518077_Evolutionary_Algorithm_for_Extractive_Text_Summarization/links/09e4151356fc2caab6000000.pdf)
- [An Improved Evolutionary Algorithm for Extractive Text Summarization](https://link.springer.com/chapter/10.1007/978-3-642-36543-0_9)

## Sentence Clustering

In [4]:
def distinct_words(text):
    """Return the distinct word tokens of ``text`` as a frozenset.

    Characters found in ``string.punctuation`` are dropped first; what is left
    is split into tokens with ``tokenize.word_tokenize``.
    """
    kept_chars = [char for char in text if char not in string.punctuation]
    cleaned = ''.join(kept_chars)
    tokens = tokenize.word_tokenize(cleaned)
    return frozenset(tokens)

Let document $ D $ be decomposed into a set of $ n $ sentences.  
$
D = \{ S_1, S_2, ..., S_n \}
$

In [5]:
# Sentences of the (lower-cased) article, stored as a set: duplicates collapse
# and the original sentence order is not kept.
document_sentences = set(tokenize.sent_tokenize(text))
D = document_sentences

Let terms $ T $ be the set of all $ m $ distinct words in $ D $.  
$
T = \{ t_1, t_2, ..., t_m \}
$

In [6]:
# Every distinct term that occurs anywhere in the article.
document_distinct_words = distinct_words(text)
T = document_distinct_words

Let $ S_i $ represent the set of distinct terms in sentence $ S_i $ with 
$ m_i $ distinct terms.  
$
S_i = \{ t_1, t_2, ..., t_{ m_i } \}
$

In [7]:
# One frozenset of distinct terms per sentence (sentences that share the same
# term set collapse into a single entry because the container is a set).
sentence_distinct_words = set(map(distinct_words, document_sentences))
S = sentence_distinct_words

## Similarity Measures

### Jaccard Coefficient Similarity Measure

$
\large
sim_{ jaccard } ( S_i, S_j ) = 
\dfrac
    { | S_i \cap S_j | }
    { | S_i \cup S_j | }
$

In [8]:
def jaccard_similarity(a, b):
    """Jaccard coefficient of two collections: |intersection| / |union|.

    Both arguments are converted to sets before comparison. Two empty
    collections are treated as identical and score 1.0.
    """
    left = set(a)
    right = set(b)
    union = left | right
    if not union:
        # Both inputs are empty; avoid dividing by zero.
        return 1.0
    shared = left & right
    return len(shared) / len(union)

In [9]:
# Two tokenized sentences that share no word, so the coefficient is 0.
a = "however doctors learned that long inactivity did more harm than good".split()
b = "patients got out of shape developed blood clots and became demoralized".split()

jaccard_similarity(a, b)

0.0

### Normalized Google Metrics

$
\large
\text{NGD} ( t_k, t_l ) = \dfrac 
    { \text{max} \big\{
        \text{log} ( f_k ), 
        \text{log} ( f_l )
    \big\}
    - \text{log} ( f_{ kl } ) }
    { \text{log} ( n ) 
    - \text{min} \big\{
        \text{log} ( f_k ), 
        \text{log} ( f_l )
    \big\} }
$

where:
- $ t_k $ and $ t_l $ are terms 
- $ f_k $ is the number of sentences containing $ t_k $
- $ f_{ kl } $ is the number of sentences containing both $ t_k $ and $ t_l $
- $ n $ is the total number of sentences

$
\large 
sim_{ \text{NGD} } ( t_k, t_l ) = \text{exp} 
    \big( - \text{NGD} ( t_k, t_l ) \big)
$

$
\large 
sim_{ \text{NGD} } ( S_i, S_j ) = 
\dfrac
    { \sum\limits
        _{ \small t_k \in S_i } 
        \sum\limits
            _{ \small t_l \in S_j } 
            sim_{ \text{NGD} } ( t_k, t_l ) }
    { m_i m_j }
$  

where:
- $ S_i $ and $ S_j $ are sentences
- $ m_i $ is the number of words in $ S_i $

In [10]:
class NormalizedGoogle:
    """Normalized Google Distance (NGD) similarity over one document's sentences.

    Frequencies are counted per sentence: the frequency of a term is the
    number of sentences whose distinct-word set contains it.
    """

    def __init__(self, document):
        """Split ``document`` into sentences and keep each one's distinct words."""
        self.sentence_words = tuple(distinct_words(sent) for sent in tokenize.sent_tokenize(document))

    # double check scientific paper's handling of "bad" log values
    def distance(self, term_k, term_l):
        """Return NGD(term_k, term_l).

        Special cases:
        - the terms never share a sentence: returns 1.0 (``log(0)`` is undefined);
        - both terms occur in every sentence: returns 0.0 (the formula is 0 / 0).

        Raises:
            ValueError: if either term occurs in no sentence.
        """
        freq_k = sum(term_k in sent for sent in self.sentence_words)
        freq_l = sum(term_l in sent for sent in self.sentence_words)
        if not (freq_k and freq_l):
            raise ValueError('terms must be in document')

        freq_kl = sum((term_k in sent) and (term_l in sent) for sent in self.sentence_words)
        if freq_kl == 0:
            return 1.0

        logs_k_l = (math.log(freq_k), math.log(freq_l))
        n = len(self.sentence_words)

        numerator = max(logs_k_l) - math.log(freq_kl)
        denominator = math.log(n) - min(logs_k_l)
        if denominator == 0:
            # The rarer term occurs in all n sentences, hence so does the other
            # one and freq_kl == n: the numerator is 0 as well. Terms with the
            # same occurrence pattern are at distance 0.
            return 0.0
        return numerator / denominator

    def term_similarity(self, term_k, term_l):
        """Return exp(-NGD(term_k, term_l))."""
        dist = self.distance(term_k, term_l)
        return math.exp(-dist)

    def sentence_similarity(self, sent_i, sent_j):
        """Return the mean term similarity over all (term_k, term_l) pairs.

        ``sent_i`` and ``sent_j`` are collections of terms; the sum over their
        cartesian product is divided by both lengths.
        """
        total = sum(self.term_similarity(term_k, term_l)
                    for term_k, term_l in itertools.product(sent_i, sent_j))
        return total / len(sent_i) / len(sent_j)

## Objective Functions

Let $ C $ be a partition of $ D $ with $ k $ clusters.  
$ C = \{ C_1, C_2, ..., C_k \} $  

where:
- $ C_p \cap C_q = \emptyset, 
        \forall p \ne q \in \{ 1, 2, ..., k \} 
  $
- $ \bigcup\limits
    _{ p = 1 }
    ^k C_p = D 
  $
- $ C_p \ne \emptyset, 
        \forall p \in \{ 1, 2, ..., k \}
  $

In [11]:
def verify_clusters(clusters, document):
    """Raise ``ValueError`` unless ``clusters`` is a partition of ``document``.

    A partition requires the clusters to be pairwise disjoint, non-empty, and
    to have ``document`` as their union. Returns ``None`` on success.
    """
    # Materialize once so a one-shot iterable survives the three passes below.
    clusters = list(clusters)
    disjoint = all(not cluster_i & cluster_j for cluster_i, cluster_j in itertools.combinations(clusters, r=2))
    # set().union(*[]) is the empty set, so an empty cluster list is reported
    # through the ValueError below instead of a TypeError from reduce().
    union = set().union(*clusters) == document
    nonempty = all(clusters)
    if not (disjoint and union and nonempty):
        raise ValueError('clusters do not form a partition')

In [12]:
def clusterize(chromosome):
    """Group gene positions by the cluster label stored at each position.

    Returns a ``defaultdict(list)`` that maps every label present in
    ``chromosome`` to the ascending list of positions carrying that label.
    """
    members_by_label = collections.defaultdict(list)
    position = 0
    for label in chromosome:
        members_by_label[label].append(position)
        position += 1
    return members_by_label

### Sigmoid Function
$
\large
sigm ( x ) = 
\dfrac
    { 1 }
    { 1 + \text{exp} ( -x ) }
$

In [13]:
from scipy.special import expit as sigmoid
# interesting: this is 3x faster than my implementation

### Intra-Cluster Similarity (Cohesion)

$
\large
F_1 = 
\sum\limits
    _{ \small p = 1 }
    ^{ \small k } 
\sum\limits
    _{ \small S_i, S_j \in C_p } 
\dfrac 
    { sim ( S_i, S_j ) } 
    { | C_p | } 
\rightarrow \text{max}
$

*(__Note:__ Evol Alg for Ext Txt Summ doesn't show division of $|C_p|$ but I believe this to be a typo since the paragraph detailing it says "the average sum". Additionally the Disc Diff Evol for Txt Summ shows it as such.)*

In [14]:
def cohesion(chromosome, sentences, sim):
    """Intra-cluster similarity (F1) of the clustering encoded by ``chromosome``.

    For every cluster, ``sim`` is evaluated on each unordered pair of its
    sentences and divided by the cluster size; the results are summed over
    all clusters.

    Args:
        chromosome: cluster label per sentence position.
        sentences: indexable collection (list, tuple or array) of sentences.
        sim: callable ``sim(sentence_a, sentence_b) -> number``.
    """
    total = 0
    for members in clusterize(chromosome).values():
        size = len(members)
        for i, j in itertools.combinations(members, r=2):
            # Plain indexing rather than sentences[[i, j]]: no temporary array
            # per pair, and plain Python sequences are accepted too.
            total += sim(sentences[i], sentences[j]) / size
    return total

### Inter-Cluster Dissimilarity (Separation)

$
\large
F_2 = 
\sum\limits
    _{ \small p = 1 }
    ^{ \small k - 1 }
\sum\limits
    _{ \small q = p + 1 }
    ^{ \small k } 
\sum\limits
    _{ \small S_i \in C_p } 
\sum\limits
    _{ \small S_j \in C_q } 
\dfrac 
    { sim ( S_i, S_j ) } 
    { | C_p | \cdot | C_q | }
\rightarrow \text{min}
$

In [15]:
def separation(chromosome, sentences, sim):
    """Inter-cluster similarity (F2) of the clustering encoded by ``chromosome``.

    For every unordered pair of clusters, ``sim`` is evaluated on each
    cross-cluster sentence pair and divided by both cluster sizes; the
    results are summed. Lower values mean better separated clusters.

    Args:
        chromosome: cluster label per sentence position.
        sentences: indexable collection (list, tuple or array) of sentences.
        sim: callable ``sim(sentence_a, sentence_b) -> number``.
    """
    total = 0
    clusters = clusterize(chromosome)
    for cluster_p, cluster_q in itertools.combinations(clusters.values(), r=2):
        size_p, size_q = len(cluster_p), len(cluster_q)
        for i, j in itertools.product(cluster_p, cluster_q):
            # Plain indexing rather than sentences[[i, j]]: no temporary array
            # per pair, and plain Python sequences are accepted too.
            total += sim(sentences[i], sentences[j]) / size_p / size_q
    return total

### Inter/Intra-Cluster Balance

$
\large
F = \big( 1 + sigm ( F_1 ) \big)
    ^{ F_2 } \rightarrow \text{max}
$

In [16]:
def cohesion_separation(chromosome, sentences, sim):
    """Combined objective ``(1 + sigmoid(F1)) ** F2`` for ``chromosome``.

    F1 is the value of ``cohesion`` and F2 the value of ``separation``, both
    computed with the same ``sentences`` and ``sim``.
    """
    intra = cohesion(chromosome, sentences, sim)
    inter = separation(chromosome, sentences, sim)
    base = 1 + sigmoid(intra)
    return base ** inter

## Fitness

$
\large
fitness_1 \big( X_a ( t ) \big) = F_1 \big( X_a ( t ) \big)
$

$
\large
fitness_2 \big( X_a ( t ) \big) = \dfrac{ 1 }{ F_2 ( X_a ( t ) ) }
$

$
\large
fitness \big( X_a ( t ) \big) = F \big( X_a ( t ) \big)
$

In [17]:
def fitness_1(chromosome, sentences, sim):
    """Cohesion-only fitness: the F1 value of ``chromosome``."""
    score = cohesion(chromosome, sentences, sim)
    return score

def fitness_2(chromosome, sentences, sim):
    """Separation-only fitness: the reciprocal of F2 for ``chromosome``.

    Returns ``math.inf`` when the separation is 0 (for example when only one
    cluster is populated or no cross-cluster pair has any similarity) instead
    of raising ``ZeroDivisionError``; a lower F2 is better, so 0 is the best
    possible value.
    """
    sep = separation(chromosome, sentences, sim)
    if sep == 0:
        return math.inf
    return 1 / sep

def fitness(chromosome, sentences, sim):
    """Balanced fitness: the combined cohesion/separation objective F."""
    score = cohesion_separation(chromosome, sentences, sim)
    return score

# Modified Discrete Differential Evolution Algorithm

Initialize the population with $ N $ chromosomes, each composed of $ n $ random integers from \[1, k\] _(the code below uses zero-based labels, $ 0 $ to $ k - 1 $)_.  

$
X_r ( t ) = [ x_{ r, 1 } ( t ), x_{ r, 2 } ( t ), ..., x_{ r, n } ( t ) ]
$  

where:
- $ x_{ r, s } ( t ) \in \{ 1, 2, ..., k \} $
- $ r = 1, 2, ..., N $
- $ s = 1, 2, ..., n $
- $ N $ is the population size
- $ n $ is the number of sentences _(in the document)_
- $ k $ is the number of clusters _(number of sentences for summary)_
- $ t $ is the iteration step

In [18]:
def assign_clusters(num_of_clusters, num_of_sentences):
    """Return a random chromosome in which every cluster label occurs at least once.

    The chromosome is an integer array of length ``num_of_sentences`` with
    labels ``0 .. num_of_clusters - 1``. Each label is placed once, the
    remaining genes are drawn uniformly at random, and the result is shuffled.
    This always terminates, unlike resampling whole chromosomes until one
    happens to contain every label.

    Raises:
        ValueError: if ``num_of_clusters`` is smaller than 1 or greater than
            ``num_of_sentences`` (no valid chromosome exists).
    """
    if num_of_clusters < 1:
        raise ValueError('num_of_clusters must be at least 1')
    if num_of_clusters > num_of_sentences:
        raise ValueError('num_of_clusters cannot be greater than num_of_sentences')

    guaranteed = np.arange(num_of_clusters)
    filler = np.random.randint(num_of_clusters, size=num_of_sentences - num_of_clusters)
    chromosome = np.concatenate([guaranteed, filler])
    np.random.shuffle(chromosome)
    return chromosome
    

def initialize_population(population_size, num_of_clusters, num_of_sentences):
    """Build the initial population as an array with one chromosome per row.

    Each of the ``population_size`` chromosomes comes from ``assign_clusters``.

    Raises:
        ValueError: if there are more clusters than sentences.
    """
    if num_of_sentences < num_of_clusters:
        raise ValueError('num_of_clusters cannot be greater than num_of_sentences')
    chromosomes = []
    for _ in range(population_size):
        chromosomes.append(assign_clusters(num_of_clusters, num_of_sentences))
    return np.array(chromosomes)

$
\large
y_{ r, s } ( t + 1 ) = 
\begin{cases}
    x_{ r1, s } ( t ) + \lambda \big( x_{ r2, s } ( t ) - x_{ r3, s } ( t ) \big) 
        & \text{if } rand_s < \text{CR} \\
    x_{ r, s } ( t ) & \text{otherwise}
\end{cases}
$

where  
- For each $ X_r $, randomly sample $ X_{ r1 } ( t ), X_{ r2 } ( t ), X_{ r3 } ( t ) $ 
  from the same generation _(each distinct)_
- $ rand_s $ is a uniformly distributed random number from $ [ 0, 1 ] $,
  drawn once for each $ s \in \{ 1, 2, ..., n \} $

hyper-parameters 
- $ \lambda $ is a scale factor from $ [ 0, 1 ] $
- $ \text{CR} $ is the crossover rate from $ [ 0, 1 ] $

In [19]:
def generate_offspring(population, scaling_factor, crossover_rate, randoms, num_of_clusters=None):
    """Create one trial chromosome per member of ``population``.

    For chromosome ``i``, three other distinct chromosomes ``l``, ``m``, ``n``
    are sampled. Every gene whose random number is below ``crossover_rate`` is
    replaced by ``(l + scaling_factor * (m - n)) % num_of_clusters``; the other
    genes are copied from chromosome ``i``. Replacement values are truncated
    to the population's integer dtype on assignment.

    Args:
        population: 2-D integer array, one chromosome per row, labels starting at 0.
        scaling_factor: weight of the difference term.
        crossover_rate: probability threshold compared against ``randoms``.
        randoms: array of the same shape as ``population`` with values in [0, 1).
        num_of_clusters: number of labels ``k``. Defaults to
            ``population.max() + 1`` so that labels ``0 .. k - 1`` stay
            reachable; pass it explicitly if the top label may have died out.

    Raises:
        ValueError: if the population has fewer than 4 chromosomes.
    """
    population_size, num_of_sentences = population.shape
    if population_size < 4:
        raise ValueError('population must contain at least 4 chromosomes')
    if num_of_clusters is None:
        # Labels are zero-based, so the label count is max + 1. Using the bare
        # maximum as modulus would make the top label unreachable and divide
        # by zero for an all-zero population.
        num_of_clusters = int(np.max(population)) + 1

    randoms = np.asarray(randoms)
    all_idxs = np.arange(population_size)
    offspring = population.copy()

    for i, rand_row in enumerate(randoms):
        #: must select chrom != chrom_l != chrom_m != chrom_n from population
        candidates = all_idxs[all_idxs != i]
        idx_l, idx_m, idx_n = np.random.choice(candidates, size=3, replace=False)

        donor = population[idx_l] + scaling_factor * (population[idx_m] - population[idx_n])
        crossed = rand_row < crossover_rate
        offspring[i, crossed] = (donor % num_of_clusters)[crossed]

    return offspring


def randomize(population):
    """Draw one uniform random number from [0, 1) per gene of ``population``.

    The returned array has the same shape as ``population``.
    """
    shape = population.shape
    return np.random.random_sample(size=shape)

$
\large
X_r(t+1) = 
\begin{cases}
    Y_r ( t + 1 ) & \text{if } f \big( Y_r ( t + 1 ) \big) > f \big( X_r ( t ) \big) \\
    X_r ( t ) & \text{otherwise}
\end{cases}
$

where 
- $ f ( \cdot ) $ is the objective function to be maximized

In [20]:
def next_generation(population, offspring, fitness):
    """Select, row by row, the fitter of parent and trial chromosome.

    A trial chromosome replaces its parent only when ``fitness`` scores it
    strictly higher; ties keep the parent. ``population`` is not modified.
    """
    survivors = population.copy()
    for row, (parent, child) in enumerate(zip(population, offspring)):
        child_score = fitness(child)
        parent_score = fitness(parent)
        if child_score > parent_score:
            survivors[row] = child
    return survivors

## Mutation

At each iteration $ t + 1 $, a binary mutation vector is created for each $ X_r ( t ) $:
$ m_r ( t + 1 ) = [ m_{ r, 1 } ( t + 1 ), m_{ r, 2 } ( t + 1 ), ..., m_{ r, n } ( t + 1 ) ] $.  
For each gene, 1 indicates no mutation and 0 means mutate.

$
\large
m_{ r, s } ( t + 1 ) = 
\begin{cases}
    1 & \text{if } rand_s < sigm \big( y_{ r, s } ( t + 1 ) \big) \\
    0 & \text{otherwise}
\end{cases}
$

In [21]:
def mutate(offspring, randoms):
    """Build the 0/1 mutation mask for ``offspring``.

    An entry is 1 where the random number is below ``sigmoid(gene)`` and 0
    otherwise. The mask has the shape and dtype of ``offspring``.

    Args:
        offspring: array of genes.
        randoms: array with as many elements as ``offspring``; it is paired
            with the genes in row-major order.
    """
    offspring = np.asarray(offspring)
    randoms = np.asarray(randoms).reshape(offspring.shape)
    # One vectorized comparison replaces the per-element loop, whose writes
    # through ``mutations.ravel()[i]`` were lost whenever ravel() had to copy
    # (i.e. for arrays that are not C-contiguous).
    unchanged = randoms < sigmoid(offspring)
    return unchanged.astype(offspring.dtype)

## Inversion Operator

### Figure 1 - Pseudo-code
<img src="./data/pngs/fig1_-_inversion_operator_psuedo_code.png" alt="Fig 1. Inverse operator psuedo-code" width="33%" align="left"/>

In [22]:
def inversion_operator(chromosome, mutation):
    """Reverse the genes sitting at the positions flagged for mutation.

    Positions whose ``mutation`` entry is falsy are paired from the outside
    in (lowest with highest, and so on) and each pair's genes are swapped. An
    unpaired middle position keeps its gene. ``chromosome`` is left untouched;
    a modified copy is returned.
    """
    inverted = chromosome.copy()
    # enumerate() visits positions in ascending order, so this list is sorted.
    flagged = [pos for pos, keep in enumerate(mutation) if not keep]

    low, high = 0, len(flagged) - 1
    while low <= high:
        left, right = flagged[low], flagged[high]
        inverted[left] = chromosome[right]
        inverted[right] = chromosome[left]
        low += 1
        high -= 1

    return inverted

In [23]:
def is_partition(clusters, document):
    """Check that ``clusters`` is a partition of ``document``.

    A partition requires the clusters to be pairwise disjoint, non-empty, and
    to have ``document`` as their union.

    Returns:
        ``True`` when the check passes, so the function can be used in an
        ``assert`` or ``if``.

    Raises:
        ValueError: if ``clusters`` is not a partition of ``document``.
    """
    # Materialize once so a one-shot iterable survives the three passes below.
    clusters = list(clusters)
    disjoint = all(not cluster_i & cluster_j for cluster_i, cluster_j in itertools.combinations(clusters, r=2))
    # set().union(*[]) is the empty set, so an empty cluster list is reported
    # through the ValueError below instead of a TypeError from reduce().
    union = set().union(*clusters) == document
    nonempty = all(clusters)
    if not (disjoint and union and nonempty):
        raise ValueError('clusters do not form a partition')
    return True

### Figure 2 - Example
<img src="./data/pngs/fig2_-_inversion_operator_diagram.png" alt="Fig 2. Inverse operator example" width="33%" align="left"/>

In [24]:
# testing Fig 2. input/output
# Check the operator against the input/output shown in Fig 2: the positions
# flagged 0 in `mutation` (0, 3, 5, 6) are swapped outermost-first
# (0 <-> 6, then 3 <-> 5); positions flagged 1 keep their gene.
chromosome = [3, 2, 4, 2, 3, 1, 4, 1]
mutation = [0, 1, 1, 0, 1, 0, 0, 1]
next_gen = [4, 2, 4, 1, 3, 2, 3, 1]

assert inversion_operator(chromosome, mutation) == next_gen

## Run Iterations

In [28]:
def evolve_generations(population, criterion, scaling_factor, crossover_rate, iterations):
    """Run ``iterations`` rounds of the evolutionary loop and return the final population.

    Each round draws one random number per gene, builds trial chromosomes with
    ``generate_offspring``, keeps the fitter of parent and trial according to
    ``criterion``, derives a mutation mask with ``mutate`` (reusing the same
    random numbers) and applies ``inversion_operator`` to every survivor. The
    round index is printed as a progress indicator.
    """
    current = population
    for step in range(iterations):
        print(step)
        randoms = randomize(current)
        trial = generate_offspring(current, scaling_factor, crossover_rate, randoms)
        survivors = next_generation(current, trial, criterion)
        masks = mutate(survivors, randoms)
        for row, (survivor, mask) in enumerate(zip(survivors, masks)):
            survivors[row] = inversion_operator(survivor, mask)
        current = survivors
    return current


scaling_factor = 0.9
crossover_rate = 0.5

# Freeze the sentence order in a list so that chromosome position i can be
# mapped back to sentence_list[i] once the run has finished.
sentence_list = list(document_sentences)

# jaccard_similarity() calls set() on its arguments, so raw sentence strings
# would be compared character by character. Pass each sentence's distinct
# words instead. The object array is filled element by element so that numpy
# stores the frozensets as-is.
sentence_terms = np.empty(len(sentence_list), dtype=object)
for idx, sentence in enumerate(sentence_list):
    sentence_terms[idx] = distinct_words(sentence)

criterion = functools.partial(fitness, sentences=sentence_terms, sim=jaccard_similarity)
population = initialize_population(population_size=100, num_of_clusters=5, num_of_sentences=len(sentence_list))
evolved = evolve_generations(population, criterion, scaling_factor, crossover_rate, iterations=1000)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33


KeyboardInterrupt: 

## ROUGE-N
_(Recall Oriented Understudy for Gisting Evaluation)_

$
\large
\text{ROUGE-N} = 
\dfrac
    { \sum \limits
        _{ \small S \in Summ_{ ref }} 
        \sum \limits
        _{ \small \text{N-gram} \in S } 
            Count_{ \small match } ( \text{N-gram} ) }
    { \sum \limits
        _{ \small S \in Summ_{ ref } } 
        \sum \limits
            _{ \small \text{N-gram} \in S } 
            Count( \text{N-gram} ) }
$

In [31]:
def rouge_n(n, y_pred, y_true):
    """ROUGE-N recall of the token sequence ``y_pred`` against the reference ``y_true``.

    N-grams are counted with multiplicity: a reference n-gram that occurs
    ``c`` times is matched at most ``c`` times and at most as often as it
    occurs in the prediction. The number of matches is divided by the total
    number of reference n-grams.

    Args:
        n: n-gram length.
        y_pred: predicted summary as an iterable of tokens.
        y_true: reference summary as an iterable of tokens.

    Returns:
        The recall in [0, 1]; 0.0 when the reference holds no n-gram of
        length ``n`` (for example when it is shorter than ``n`` tokens).
    """
    def ngram_counts(tokens):
        # Zip n progressively shifted copies of the token list: each resulting
        # tuple is one run of n consecutive tokens.
        tokens = list(tokens)
        return collections.Counter(zip(*(tokens[shift:] for shift in range(n))))

    pred_counts = ngram_counts(y_pred)
    true_counts = ngram_counts(y_true)

    total = sum(true_counts.values())
    if total == 0:
        return 0.0
    # Counter intersection keeps the minimum count per n-gram (clipped match).
    matched = sum((pred_counts & true_counts).values())
    return matched / total

https://rare-technologies.com/text-summarization-in-python-extractive-vs-abstractive-techniques-revisited/#how_to_evaluate_text

In [32]:
# Sanity check: 7 of the 8 reference unigrams and 4 of the 7 reference bigrams
# also occur in the prediction.
y_true = 'a good diet must have apples and bananas'.split()
y_pred = 'apples and bananas are must for a good diet'.split()

assert rouge_n(1, y_pred, y_true) == 7 / 8
assert rouge_n(2, y_pred, y_true) == 4 / 7