In [2]:
import itertools
import multiprocessing

import numba
import numpy as np

from nltk import tokenize
from nltk.util import ngrams

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_distances

In [3]:
@numba.njit
def jaccard_similarity(a, b):
    union = np.sum(a | b)
    if not union:
        return 1.0
    return np.sum(a & b) / union


@numba.njit
def sigmoid(x):
    return 1 / (1 + np.exp(-x))


@numba.njit
def cohesion(chromosome, similarity, document):
    total = 0
    for p in np.unique(chromosome):
        sents = document[chromosome == p]
        k = len(sents)
        #: combinations choose 2
        for i in range(k-1):
            for j in range(i+1, k):
                total += similarity(sents[i], sents[j]) / len(sents)  
    return total


@numba.njit
def separation(chromosome, similarity, document):
    total = 0
    k = len(np.unique(chromosome))
    #: combinations choose 2
    for p in range(k-1):
        for q in range(p+1, k):
            sents_p = document[chromosome == p]
            sents_q = document[chromosome == q]
            #: product
            m, n = len(sents_p), len(sents_q)
            for i in range(m):
                for j in range(n):
                    total += similarity(sents_p[i], sents_q[j]) / m / n
    return total


@numba.njit
def cohesion_separation(chromosome, similarity, document):
    coh = cohesion(chromosome, similarity, document)
    sep = separation(chromosome, similarity, document)
    return (1 + sigmoid(coh)) ** sep

In [4]:
def init_chromosome(choices, length):
    chrom = np.full(length, -1)
    #: ensure that each choice is accounted for at least once
    idxs = np.random.choice(np.arange(length), len(choices), replace=False)
    chrom[idxs] = np.random.permutation(choices)
    idxs = np.where(chrom == -1)[0]
    chrom[idxs] = np.random.choice(choices, len(idxs))
    return chrom


def init_population(population_size, cluster_amount, chromosome_length):
    clusts = np.arange(cluster_amount)
    chroms = [init_chromosome(clusts, chromosome_length) for _ in range(population_size)]
    pop = np.vstack(chroms)
    return pop


def get_offspring_distinct(population, randoms, lambda_, crossover_rate):
    n = np.arange(len(population))
    s = set(n)
    idxs = np.array([np.random.choice(tuple(s - {i}), size=3, replace=False) for i in n])
    chrom_1, chrom_2, chrom_3 = map(np.squeeze, np.split(population[idxs], 3, axis=1))
    k = len(np.unique(population))
    offspr = (chrom_1 + lambda_ * (chrom_2 - chrom_3)) % k
    mask = randoms < crossover_rate
    offspr[mask] = population[mask]
    return offspr


def get_offspring(population, randoms, lambda_, crossover_rate):
    #: For computation time, relax requirement that X_r, X_r1, X_r2, X_r3 are distinct. 
    #: With large population size, this is unlikely to occur, and if it does, it doesn't
    #: seem that detrimental. Also is this mitigated with appropriate lam choice?
    n = len(population)
    idxs = np.random.choice(np.arange(n), size=(n, 3))
    chrom_1, chrom_2, chrom_3 = map(np.squeeze, np.split(population[idxs], 3, axis=1))
    k = len(np.unique(population))
    offspr = (chrom_1 + lambda_ * (chrom_2 - chrom_3)) % k
    mask = randoms < crossover_rate
    offspr[mask] = population[mask]
    return offspr


# def next_generation(population, offspring, func):
#     fit_off = np.array([func(chrom) for chrom in offspring])
#     fit_pop = np.array([func(chrom) for chrom in population])
#     mask = fit_off > fit_pop
#     population[mask] = offspring[mask]
#     return


def next_generation(population, offspring, func, pool):
    fits = pool.map(func, itertools.chain(population, offspring))
    l = len(population)
    fit_pop = np.array(fits[:l])
    fit_off = np.array(fits[l:])
    mask = fit_off > fit_pop
    population[mask] = offspring[mask]
    return


def mutate(population, randoms):
    mask = randoms < sigmoid(population)
    #: inversion operator
    idxs = np.nonzero(mask)
    arr = np.array(idxs)
    sorter = np.lexsort((-arr[1], arr[0]))
    rev = arr.T[sorter].T
    population[idxs] = population[(rev[0], rev[1])]
    return

In [35]:
#TODO: early stopping --> little fitness improvement over x generations, good enough fitness score
def run_iterations(pop_size, summ_len, num_sents, func, lam, cr, iterations, *, mutate_after=True,
                   seed=None, verbose=False, save_rate=np.nan, save_dir=None):
    
    print(summ_len)
    
    if save_dir is not None:
        save_dir = pathlib.Path(save_dir)
        if not save_dir.is_dir():
            msg = f'save_dir={save_dir} not a valid directory path'.format(save_dir=save_dir)
            raise NotADirectoryError(msg)
    
    if seed is not None:
        np.random.seed(seed)
        
    if isinstance(summ_len, int):
        print('int')
        if not (0 < summ_len < num_sents):
            raise ValueError('int summ_len must be between 0 and the number of sentences in the document')
    elif isinstance(summ_len, float):
        print('flt')
        if not (0.0 < summ_len < 1.0):
            raise ValueError('float summ_len must be between 0.0 and 1.0')
        summ_len = int(summ_len * num_sents)
    else:
        raise TypeError('summ_len must be a float or int')
        
    if pop_size < 10:
        raise ValueError('pop_size must be at least 10')
    pop_size = int(pop_size)
    
    print(summ_len)
    
    
    pop = init_population(pop_size, summ_len, num_sents)
    shape = pop.shape
    pool = multiprocessing.Pool()
    
    return pop
    
    for i in range(iterations):
        if i % save_rate == 0:
            file = save_dir / 'generation_{i:0>pad}'.format(i=i, pad=len(str(iterations)))
            np.save(file, pop)
            
        if verbose:
            print(i)  #TODO: logfile --> iteration number, best fitness score, avg fitness score, hyper-params
        
        rand = np.random.random_sample(shape)
        
        t0 = time.time()
        offspr = get_offspring_distinct(pop, rand, lam, cr)
        t1 = time.time()
        PROFILER['offspring'] += t1 - t0
        
        #: option since papers unclear if mutate at offspring or survivors stage
        if not mutate_after:
            mutate(offspr, rand)
            
        t0 = time.time()
        next_generation(pop, offspr, func, pool)
#         next_generation(pop, offspr, func)
        t1 = time.time()
        PROFILER['generation'] += t1 - t0
        
        if mutate_after:
            t0 = time.time()
            mutate(pop, rand)
            t1 = time.time()
            PROFILER['mutate'] += t1 - t0
    
    pool.terminate()
    return pop

In [6]:
def best_chromosome(population):
    #TODO: make sure it picks one with all k-clusters
    fits = np.argmax([fitness(chrom) for chrom in population])
    chrom = population[fits]
    return chrom
    

def central_sentences(chromosome, document, metric=cosine_distances):
    central_sents = []
    for cluster in np.unique(chromosome):
        idxs = np.where(chromosome == cluster)[0]
        sents = document[idxs]
        centroid = sents.mean(axis=0)[np.newaxis,:]
        dists = metric(sents, centroid)
        cent_sent = idxs[np.argmin(dists)]
        central_sents.append(cent_sent)
    return sorted(central_sents)

In [7]:
def rouge_n(n, y_pred, y_true):
    n_gram_pred = set(ngrams(y_pred, n))
    n_gram_true = set(ngrams(y_true, n))
    return len(n_gram_pred & n_gram_true) / len(n_gram_true)

# Run Algorithm

In [10]:
import pathlib

cwd = pathlib.Path.cwd()
path = cwd / 'data' / 'grinch.txt'
with open(path) as fp:
    text = fp.read()
text[:100]

'Every Who Down in Whoville Liked Christmas a lot...\nBut the Grinch,Who lived just north of Whoville,'

In [318]:
import json
import pathlib

cwd = pathlib.Path.cwd()
data = cwd / 'data'
jsons = data / 'jsons'
json_2018 = jsons / '2018' / '2018.json'

with open(json_2018) as fp:
    articles_2018 = json.load(fp)['2018']

article = articles_2018[863]
text = article['story']
summ_true = article['summary']
# print(text)

In [11]:
print(text)

Every Who Down in Whoville Liked Christmas a lot...
But the Grinch,Who lived just north of Whoville, Did NOT!
The Grinch hated Christmas! The whole Christmas season!
Now, please don't ask why. No one quite knows the reason.
It could be his head wasn't screwed on just right.
It could be, perhaps, that his shoes were too tight.
But I think that the most likely reason of all,
May have been that his heart was two sizes too small.
Whatever the reason, His heart or his shoes,
He stood there on Christmas Eve, hating the Whos,
Staring down from his cave with a sour, Grinchy frown,
At the warm lighted windows below in their town.
For he knew every Who down in Whoville beneath,
Was busy now, hanging a mistletoe wreath.
"And they're hanging their stockings!" he snarled with a sneer,
"Tomorrow is Christmas! It's practically here!"
Then he growled, with his Grinch fingers nervously drumming,
"I MUST find some way to stop Christmas from coming!"
For Tomorrow, he knew, all the Who girls and boys,
Wou

In [320]:
summ_true

' The president\'s support is a big win for activists who have been pushing for criminal justice measures that roll back the stiff penalties imposed as a part of America\'s decades-long "war on drugs."'

In [30]:
import time
import collections


PROFILER = collections.Counter()


def fitness(chromosome):
    return cohesion_separation(chromosome, jaccard_similarity, doc)

cv = CountVectorizer(stop_words='english')
sents_lower = tokenize.sent_tokenize(text.lower())
# sents_lower = [t.strip() for t in tokenize.line_tokenize(text.lower())]
vec = cv.fit_transform(sents_lower)
doc = vec.toarray().astype(bool).astype(int)

ratio = 0.05

t0 = time.time()
pop = run_iterations(pop_size=100, summ_len=10, num_sents=len(doc), 
                     func=fitness, lam=0.9, cr=0.5, iterations=1000, verbose=False, seed=0)
t1 = time.time()
print(t1-t0)


chrom_best = best_chromosome(pop)
idxs = central_sentences(chrom_best, doc)
#: 2018 article[999] -- gay skier winter olympics -> no lower() causes wierd sentence breaks that throw off summary
# sents_norm = tokenize.sent_tokenize(text)
summ_evol = '\n'.join(np.array(sents_lower)[idxs])

PROFILER

416.2935118675232


Counter({'offspring': 3.7587873935699463,
         'generation': 410.84064245224,
         'mutate': 1.3425624370574951})

In [36]:
pop = run_iterations(pop_size=100, summ_len=10, num_sents=len(doc), 
                     func=fitness, lam=0.9, cr=0.5, iterations=1000, verbose=False, seed=0)

10
int
10


In [37]:
np.unique(pop)

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [38]:
np.unique(chrom_best)

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [43]:
[sent.split('\n') for sent in sents_lower]

[['every who down in whoville liked christmas a lot...',
  'but the grinch,who lived just north of whoville, did not!'],
 ['the grinch hated christmas!'],
 ['the whole christmas season!'],
 ["now, please don't ask why."],
 ['no one quite knows the reason.'],
 ["it could be his head wasn't screwed on just right."],
 ['it could be, perhaps, that his shoes were too tight.'],
 ['but i think that the most likely reason of all,',
  'may have been that his heart was two sizes too small.'],
 ['whatever the reason, his heart or his shoes,',
  'he stood there on christmas eve, hating the whos,',
  'staring down from his cave with a sour, grinchy frown,',
  'at the warm lighted windows below in their town.'],
 ['for he knew every who down in whoville beneath,',
  'was busy now, hanging a mistletoe wreath.'],
 ['"and they\'re hanging their stockings!"'],
 ['he snarled with a sneer,', '"tomorrow is christmas!'],
 ['it\'s practically here!"'],
 ['then he growled, with his grinch fingers nervously dr

In [39]:
idxs

[0, 8, 29, 65, 72, 90, 103, 104, 135, 165]

In [41]:
for row in np.array(sents_lower)[idxs]:
    print(row)
    print('-'*100)

every who down in whoville liked christmas a lot...
but the grinch,who lived just north of whoville, did not!
----------------------------------------------------------------------------------------------------
whatever the reason, his heart or his shoes,
he stood there on christmas eve, hating the whos,
staring down from his cave with a sour, grinchy frown,
at the warm lighted windows below in their town.
----------------------------------------------------------------------------------------------------
and they'd feast!
----------------------------------------------------------------------------------------------------
then he loaded some bags and some old empty sacks,
on a ramshackle sleigh and he hitched up old max.
----------------------------------------------------------------------------------------------------
"this is stop number one," the old grinchy claus hissed,
and he climbed to the roof, empty bags in his fist.
-----------------------------------------------------------

In [27]:
# get the uppercase correct
summ_evol = []
for sent in np.array(sents_lower)[idxs]:
    start = text.lower().index(sent)
    stop = start + len(sent)
    summ_evol.append(text[start:stop])
summ_evol = '\n'.join(summ_evol)
print(summ_evol)

Every Who Down in Whoville Liked Christmas a lot...
But the Grinch,Who lived just north of Whoville, Did NOT!
Oh, the noise!
And they'd feast!
And the Grinch grabbed the tree, and he started to shove,
When he heard a small sound like the coo of a dove.
She stared at the Grinch and said, "Santy Claus, why,”
"Why are you taking our Christmas tree?
And the one speck of food That he left in the house,
Was a crumb that was even too small for a mouse.
And the Grinch put his hand to his ear.
And the minute his heart didn't feel quite so tight,
He whizzed with his load through the bright morning light,
And he brought back the toys!


In [31]:
# get the uppercase correct
summ_evol = []
for sent in np.array(sents_lower)[idxs]:
    start = text.lower().index(sent)
    stop = start + len(sent)
    summ_evol.append(text[start:stop])
summ_evol = '\n'.join(summ_evol)
print(summ_evol)

Every Who Down in Whoville Liked Christmas a lot...
But the Grinch,Who lived just north of Whoville, Did NOT!
Whatever the reason, His heart or his shoes,
He stood there on Christmas Eve, hating the Whos,
Staring down from his cave with a sour, Grinchy frown,
At the warm lighted windows below in their town.
And they'd feast!
THEN He loaded some bags And some old empty sacks,
On a ramshackle sleigh And he hitched up old Max.
"This is stop number one," the old Grinchy Claus hissed,
And he climbed to the roof, empty bags in his fist.
Then the Grinch, very nimbly,
Stuffed all the bags, one by one, up the chimney!
The Grinch had been caught by this tiny Who daughter,
Who'd got out of bed for a cup of cold water.
She stared at the Grinch and said, "Santy Claus, why,”
"Why are you taking our Christmas tree?
"That's a noise," grinned the Grinch, "That I simply MUST hear!"
And the minute his heart didn't feel quite so tight,
He whizzed with his load through the bright morning light,
And he brou

In [32]:
np.save('grinch_pop0_len10_lam9_cr_5_iter1000_seed0.npy', chrom_best)

In [14]:
def scores(n, y_pred):
    for i in range(1, n+1):
        score = rouge_n(i, y_pred, summ_true)
        print(i, score)

In [235]:
print('evolutionary')
scores(3, summ_evol)

evolutionary
1 0.8571428571428571
2 0.7222222222222222
3 0.3735632183908046


In [19]:
import gensim

ratio = 0.05
summ_gensim = gensim.summarization.summarize(text, ratio=ratio)
print('gensim')
# scores(3, summ_gensim)

gensim


In [20]:
print(summ_gensim)

Every Who Down in Whoville Liked Christmas a lot...
The Grinch hated Christmas!
He stood there on Christmas Eve, hating the Whos,
The more the Grinch thought, "I must stop this whole thing!"
THE GRINCH GOT A WONDERFUL, AWFUL IDEA!
Did that stop the old Grinch?
On a ramshackle sleigh And he hitched up old Max. Then the Grinch said, "Giddap!" And the sleigh started down,
He took the Whos' feast!
She stared at the Grinch and said, "Santy Claus, why,”


In [1]:
import string

from sumy.parsers.plaintext import PlaintextParser
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.summarizers.text_rank import TextRankSummarizer
from sumy.nlp.tokenizers import Tokenizer



def format_sumy(sumy_summary):
    return sumy_summary
    return ''.join(char for sent in sumy_summary for char in str(sent) if char in string.printable)

parser = PlaintextParser(text, Tokenizer('english'))

text_rank = TextRankSummarizer()
summ_text_rank = format_sumy(text_rank(parser.document, len(idxs)))
print('sumy - text rank')
# scores(3, summ_text_rank)

print()

lex_rank = LexRankSummarizer()
summ_lex_rank = format_sumy(lex_rank(parser.document, len(idxs)))
print('sumy - lex rank')
# scores(3, summ_lex_rank)

NameError: name 'text' is not defined

In [None]:
print(summ_text_rank)

In [25]:
print(summ_lex_rank)

The Grinch hated Christmas!And then!But, you know, that old Grinch was so smart and so slick, He thought up a lie, and he thought it up quick!And the one speck of food That he left in the house, Was a crumb that was even too small for a mouse."They're finding out now that no Christmas is coming!""That's a noise," grinned the Grinch, "That I simply MUST hear!"And the Grinch put his hand to his ear.But it WAS merry!And the minute his heart didn't feel quite so tight, He whizzed with his load through the bright morning light, And he brought back the toys!
