In [1]:
# import multiprocessing

import numba
import numpy as np

from nltk import tokenize
from nltk.util import ngrams

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_distances

In [2]:
@numba.njit
def jaccard_similarity(a, b):
    union = np.sum(a | b)
    if not union:
        return 1.0
    return np.sum(a & b) / union


@numba.njit
def sigmoid(x):
    return 1 / (1 + np.exp(-x))


@numba.njit
def cohesion(chromosome, similarity, document):
    total = 0
    for p in np.unique(chromosome):
        sents = document[chromosome == p]
        k = len(sents)
        #: combinations choose 2
        for i in range(k-1):
            for j in range(i+1, k):
                total += similarity(sents[i], sents[j]) / len(sents)  
    return total


@numba.njit
def separation(chromosome, similarity, document):
    total = 0
    k = len(np.unique(chromosome))
    #: combinations choose 2
    for p in range(k-1):
        for q in range(p+1, k):
            sents_p = document[chromosome == p]
            sents_q = document[chromosome == q]
            #: product
            m, n = len(sents_p), len(sents_q)
            for i in range(m):
                for j in range(n):
                    total += similarity(sents_p[i], sents_q[j]) / m / n
    return total


@numba.njit
def cohesion_separation(chromosome, similarity, document):
    coh = cohesion(chromosome, similarity, document)
    sep = separation(chromosome, similarity, document)
    return (1 + sigmoid(coh)) ** sep

In [31]:
def init_chromosome(choices, length):
    chrom = np.full(length, -1)
    #: ensure that each choice is accounted for at least once
    idxs = np.random.choice(np.arange(length), len(choices), replace=False)
    chrom[idxs] = np.random.permutation(choices)
    idxs = np.where(chrom == -1)[0]
    chrom[idxs] = np.random.choice(choices, len(idxs))
    return chrom


def init_population(population_size, cluster_amount, chromosome_length):
    clusts = np.arange(cluster_amount)
    chroms = [init_chromosome(clusts, chromosome_length) for _ in range(population_size)]
    pop = np.vstack(chroms)
    return pop


@numba.njit
def get_distinct(n, s):
    np.array([np.random.choice(sorted(s - {i}), size=3, replace=False) for i in n])

def get_offspring_distinct(population, randoms, lambda_, crossover_rate):
    n = np.arange(len(population))
    s = set(n)
    idxs = np.array([np.random.choice(tuple(s - {i}), size=3, replace=False) for i in n])
    chrom_1, chrom_2, chrom_3 = map(np.squeeze, np.split(population[idxs], 3, axis=1))
    k = len(np.unique(population))
    offspr = (chrom_1 + lambda_ * (chrom_2 - chrom_3)) % k
    mask = randoms < crossover_rate
    offspr[mask] = population[mask]
    return offspr


def get_offspring(population, randoms, lambda_, crossover_rate):
    #: For computation time, relax requirement that X_r, X_r1, X_r2, X_r3 are distinct. 
    #: With large population size, this is unlikely to occur, and if it does, it doesn't
    #: seem that detrimental. Also is this mitigated with appropriate lam choice?
    n = len(population)
    idxs = np.random.choice(np.arange(n), size=(n, 3))
    chrom_1, chrom_2, chrom_3 = map(np.squeeze, np.split(population[idxs], 3, axis=1))
    k = len(np.unique(population))
    offspr = (chrom_1 + lambda_ * (chrom_2 - chrom_3)) % k
    mask = randoms < crossover_rate
    offspr[mask] = population[mask]
    return offspr


def next_generation(population, offspring, func):
    fit_off = np.array([func(chrom) for chrom in offspring])
    fit_pop = np.array([func(chrom) for chrom in population])
    mask = fit_off > fit_pop
    population[mask] = offspring[mask]
    return


def mutate(population, randoms):
    mask = randoms < sigmoid(population)
    #: inversion operator
    idxs = np.nonzero(mask)
    arr = np.array(idxs)
    sorter = np.lexsort((-arr[1], arr[0]))
    rev = arr.T[sorter].T
    population[idxs] = population[(rev[0], rev[1])]
    return

In [72]:
#TODO: early stopping --> little fitness improvement over x generations, good enough fitness score
def run_iterations(pop_size, summ_len, num_sents, func, lam, cr, iterations, *, mutate_after=True,
                   seed=None, verbose=False, save_rate=np.nan, save_dir=None):
    
    if save_dir is not None:
        save_dir = pathlib.Path(save_dir)
        if not save_dir.is_dir():
            msg = f'save_dir={save_dir} not a valid directory path'.format(save_dir=save_dir)
            raise NotADirectoryError(msg)
    
    if seed is not None:
        np.random.seed(seed)
        
    if isinstance(summ_len, int):
        if not (0 < summ_len < num_sents):
            raise ValueError('int summ_len must be between 0 and the number of sentences in the document')
    elif isinstance(summ_len, float):
        if not (0.0 < summ_len < 1.0):
            raise ValueError('float summ_len must be between 0.0 and 1.0')
        summ_len = int(summ_len * num_sents)
    else:
        raise TypeError('summ_len must be a float or int')
        
    if pop_size < 10:
        raise ValueError('pop_size must be at least 10')
    pop_size = int(pop_size)
    
    
    
    pop = init_population(pop_size, summ_len, num_sents)
    shape = pop.shape
    for i in range(iterations):
        if i % save_rate == 0:
            file = save_dir / 'generation_{i:0>pad}'.format(i=i, pad=len(str(iterations)))
            np.save(file, pop)
            
        if verbose:
            print(i)  #TODO: logfile --> iteration number, best fitness score, avg fitness score, hyper-params
        
        rand = np.random.random_sample(shape)
        
        t0 = time.time()
        offspr = get_offspring_distinct(pop, rand, lam, cr)
        t1 = time.time()
        PROFILER['offspring'] += t1 - t0
        
        #: option since papers unclear if mutate at offspring or survivors stage
        if not mutate_after:
            mutate(offspr, rand)
            
        t0 = time.time()
        next_generation(pop, offspr, func)
        t1 = time.time()
        PROFILER['generation'] += t1 - t0
        
        if mutate_after:
            t0 = time.time()
            mutate(pop, rand)
            t1 = time.time()
            PROFILER['mutate'] += t1 - t0
    
    return pop

In [73]:
def best_chromosome(population):
    #TODO: make sure it picks one with all k-clusters
    fits = np.argmax([fitness(chrom) for chrom in population])
    chrom = population[fits]
    return chrom
    

def central_sentences(chromosome, document, metric=cosine_distances):
    central_sents = []
    for cluster in np.unique(chromosome):
        idxs = np.where(chromosome == cluster)[0]
        sents = document[idxs]
        centroid = sents.mean(axis=0)[np.newaxis,:]
        dists = metric(sents, centroid)
        cent_sent = idxs[np.argmin(dists)]
        central_sents.append(cent_sent)
    return sorted(central_sents)

In [74]:
def rouge_n(n, y_pred, y_true):
    n_gram_pred = set(ngrams(y_pred, n))
    n_gram_true = set(ngrams(y_true, n))
    return len(n_gram_pred & n_gram_true) / len(n_gram_true)

# Run Algorithm

In [75]:
path = cwd / 'the_raven.txt'
with open(path) as fp:
    text = fp.read()
text[:100]

'  Once upon a midnight dreary, while I pondered, weak and weary,\n  Over many a quaint and curious vo'

In [229]:
import json
import pathlib

cwd = pathlib.Path.cwd()
data = cwd / 'data'
jsons = data / 'jsons'
json_2018 = jsons / '2018' / '2018.json'

with open(json_2018) as fp:
    articles_2018 = json.load(fp)['2018']

article = articles_2018[333]
text = article['story']
summ_true = article['summary']
# print(text)

In [230]:
print(text)

Although President Trump said Monday that he would be willing to meet with Iranian President Hassan Rouhani at "any time," it looks like that meeting won't be happening any time soon — multiple Iranian officials have played down the possibility of a sit-down, without ruling it out entirely.

The leaders of the U.S. and Iran have not met in person since before the Islamic Revolution in 1979. A phone call between Rouhani and former President Obama in 2013 was the first direct conversation of any kind in decades. That call was part of the lengthy negotiations toward the multinational nuclear deal reached with Iran in 2015 — which Trump pulled out of earlier this year.

As a result of Trump's decision to withdraw from the agreement, some previously lifted U.S. sanctions will be reimposed on Iran over the next few months.


"COLOR US UNIMPRESSED," Iranian foreign minister Javad Zarif responded on Twitter. "The world heard even harsher bluster a few months ago."

The bluster was gone this we

In [231]:
summ_true

" The leaders of the U.S. and Iran haven't met in person since before the Islamic Revolution in 1979. Trump says he's ready to sit down; Iranian President Hassan Rouhani hasn't expressed any interest."

In [241]:
import time
import collections


PROFILER = collections.Counter()


def fitness(chromosome):
    return cohesion_separation(chromosome, jaccard_similarity, doc)

cv = CountVectorizer(stop_words='english')
sents_lower = tokenize.sent_tokenize(text.lower())
# sents_lower = [t.strip() for t in tokenize.line_tokenize(text.lower())]
vec = cv.fit_transform(sents_lower)
doc = vec.toarray().astype(bool).astype(int)

ratio = 0.1

t0 = time.time()
pop = run_iterations(pop_size=100, summ_len=ratio, num_sents=len(doc), 
                     func=fitness, lam=0.9, cr=0.5, iterations=1000, verbose=False, seed=None)
t1 = time.time()
print(t1-t0)


chrom_best = best_chromosome(pop)
idxs = central_sentences(chrom_best, doc)
#: 2018 article[999] -- gay skier winter olympics -> no lower() causes wierd sentence breaks that throw off summary
# sents_norm = tokenize.sent_tokenize(text)
summ_evol = '\n'.join(np.array(sents_lower)[idxs])

PROFILER

23.618157863616943


Counter({'offspring': 2.972149610519409,
         'generation': 20.425386905670166,
         'mutate': 0.18253326416015625})

In [233]:
# get the uppercase correct
summ_evol = []
for sent in np.array(sents_lower)[idxs]:
    start = text.lower().index(sent)
    stop = start + len(sent)
    summ_evol.append(text[start:stop])
summ_evol = '\n'.join(summ_evol)
print(summ_evol)

"Talks with the US would have been right had the US not withdrawn from the JCPOA and imposed sanctions on Iran," Ali Motahhari said, according to Iran's state-run IRNA news outlet.
The Associated Press reports:

Kenyon reports that Rouhani himself — who has not publicly responded to the invitation — told parliament that the U.S. has proven it doesn't keep its promises and that Iran will protect its right to export oil, which is the target of U.S. sanctions.


In [234]:
def scores(n, y_pred):
    for i in range(1, n+1):
        score = rouge_n(i, y_pred, summ_true)
        print(i, score)

In [235]:
print('evolutionary')
scores(3, summ_evol)

evolutionary
1 0.8571428571428571
2 0.7222222222222222
3 0.3735632183908046


In [236]:
import gensim

summ_gensim = gensim.summarization.summarize(text, ratio=ratio)
print('gensim')
scores(3, summ_gensim)

gensim
1 0.8857142857142857
2 0.7142857142857143
3 0.367816091954023


In [237]:
print(summ_gensim)

The vice speaker of Iran's parliament also told reporters that the U.S. withdrawal from the 2015 deal, known as the Joint Comprehensive Plan of Action, made new negotiations a nonstarter.
Kenyon reports that Rouhani himself — who has not publicly responded to the invitation — told parliament that the U.S. has proven it doesn't keep its promises and that Iran will protect its right to export oil, which is the target of U.S. sanctions.


In [238]:
import string

from sumy.parsers.plaintext import PlaintextParser
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.summarizers.text_rank import TextRankSummarizer
from sumy.nlp.tokenizers import Tokenizer



def format_sumy(sumy_summary):
    return ''.join(char for sent in sumy_summary for char in str(sent) if char in string.printable)

parser = PlaintextParser(text, Tokenizer('english'))

text_rank = TextRankSummarizer()
summ_text_rank = format_sumy(text_rank(parser.document, len(idxs)))
print('sumy - text rank')
scores(3, summ_text_rank)

print()

lex_rank = LexRankSummarizer()
summ_lex_rank = format_sumy(lex_rank(parser.document, len(idxs)))
print('sumy - lex rank')
scores(3, summ_lex_rank)

sumy - text rank
1 0.8571428571428571
2 0.7063492063492064
3 0.3620689655172414

sumy - lex rank
1 0.8
2 0.6587301587301587
3 0.39655172413793105


In [239]:
print(summ_text_rank)



In [240]:
print(summ_lex_rank)

Kenyon reports that Rouhani himself  who has not publicly responded to the invitation  told parliament that the U.S. has proven it doesn't keep its promises and that Iran will protect its right to export oil, which is the target of U.S. sanctions.Kenyon says there will be some pressure on Iran to negotiate with the U.S.  but he puts that in perspective.
