In [20]:
import functools
import itertools
import logging
import multiprocessing
import pathlib

import numba
import numpy as np

from nltk import tokenize
from nltk.util import ngrams

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_distances
from sklearn.metrics import pairwise_distances

In [2]:
@numba.njit
def jaccard_similarity(a, b):
    union = np.sum(a | b)
    if not union:
        return 1.0
    return np.sum(a & b) / union


@numba.njit
def sigmoid(x):
    return 1 / (1 + np.exp(-x))


@numba.njit
def cohesion(chromosome, similarity, document):
    total = 0
    for p in np.unique(chromosome):
        sents = document[chromosome == p]
        k = len(sents)
        #: combinations choose 2
        for i in range(k-1):
            for j in range(i+1, k):
                total += similarity(sents[i], sents[j]) / len(sents)  
    return total


@numba.njit
def separation(chromosome, similarity, document):
    total = 0
    k = len(np.unique(chromosome))
    #: combinations choose 2
    for p in range(k-1):
        for q in range(p+1, k):
            sents_p = document[chromosome == p]
            sents_q = document[chromosome == q]
            #: product
            m, n = len(sents_p), len(sents_q)
            for i in range(m):
                for j in range(n):
                    total += similarity(sents_p[i], sents_q[j]) / m / n
    return total


@numba.njit
def cohesion_separation(chromosome, similarity, document):
    coh = cohesion(chromosome, similarity, document)
    sep = separation(chromosome, similarity, document)
    return (1 + sigmoid(coh)) ** sep

In [3]:
def init_chromosome(choices, length):
    chrom = np.full(length, -1)
    #: ensure that each choice is accounted for at least once
    idxs = np.random.choice(np.arange(length), len(choices), replace=False)
    chrom[idxs] = np.random.permutation(choices)
    idxs = np.where(chrom == -1)[0]
    chrom[idxs] = np.random.choice(choices, len(idxs))
    return chrom


def init_population(population_size, cluster_amount, chromosome_length):
    clusts = np.arange(cluster_amount)
    chroms = [init_chromosome(clusts, chromosome_length) for _ in range(population_size)]
    pop = np.vstack(chroms)
    return pop


def get_offspring_distinct(population, randoms, lambda_, crossover_rate):
    n = np.arange(len(population))
    s = set(n)
    idxs = np.array([np.random.choice(tuple(s - {i}), size=3, replace=False) for i in n])
    chrom_1, chrom_2, chrom_3 = map(np.squeeze, np.split(population[idxs], 3, axis=1))
    k = len(np.unique(population))
    offspr = (chrom_1 + lambda_ * (chrom_2 - chrom_3)) % k
    mask = randoms < crossover_rate
    offspr[mask] = population[mask]
    return offspr


def get_offspring(population, randoms, lambda_, crossover_rate):
    #: For computation time, relax requirement that X_r, X_r1, X_r2, X_r3 are distinct. 
    #: With large population size, this is unlikely to occur, and if it does, it doesn't
    #: seem that detrimental. Also is this mitigated with appropriate lam choice?
    n = len(population)
    idxs = np.random.choice(np.arange(n), size=(n, 3))
    chrom_1, chrom_2, chrom_3 = map(np.squeeze, np.split(population[idxs], 3, axis=1))
    k = len(np.unique(population))
    offspr = (chrom_1 + lambda_ * (chrom_2 - chrom_3)) % k
    mask = randoms < crossover_rate
    offspr[mask] = population[mask]
    return offspr


def next_generation(population, offspring, func, pool):
    fits = pool.map(func, itertools.chain(population, offspring))
    l = len(population)
    fit_pop = np.array(fits[:l])
    fit_off = np.array(fits[l:])
    mask = fit_off > fit_pop
    population[mask] = offspring[mask]
    return


def mutate(population, randoms):
    mask = randoms < sigmoid(population)
    #: inversion operator
    idxs = np.nonzero(mask)
    arr = np.array(idxs)
    sorter = np.lexsort((-arr[1], arr[0]))
    rev = arr.T[sorter].T
    population[idxs] = population[(rev[0], rev[1])]
    return

In [14]:
#TODO: early stopping --> little fitness improvement over x generations, good enough fitness score
def run_iterations(pop_size, summ_len, num_sents, func, lam, cr, iterations, *, 
                   seed=None, verbose=False, save_rate=np.nan, save_dir=None):
    
    if verbose:
        logging.info(f'pop_size={pop_size}, summ_len={summ_len}, func={func}, lam={lam}, cr={cr}, iterations={iterations}, seed={seed}')
    
    if save_dir is not None:
        save_dir = pathlib.Path(save_dir)
        if not save_dir.is_dir():
            msg = f'save_dir={save_dir} not a valid directory path'.format(save_dir=save_dir)
            raise NotADirectoryError(msg)
    
    if seed is not None:
        np.random.seed(seed)
        
    if isinstance(summ_len, int):
        if not (0 < summ_len < num_sents):
            raise ValueError('int summ_len must be between 0 and the number of sentences in the document')
    elif isinstance(summ_len, float):
        if not (0.0 < summ_len < 1.0):
            raise ValueError('float summ_len must be between 0.0 and 1.0')
        summ_len = int(summ_len * num_sents)
    else:
        raise TypeError('summ_len must be a float or int')
        
    pool = multiprocessing.Pool()
    pop = init_population(pop_size, summ_len, num_sents)
    shape = pop.shape
    for i in range(iterations):
        if i % save_rate == 0:
            file = save_dir / 'generation_{i:0>pad}'.format(i=i, pad=len(str(iterations)))
            np.save(file, pop)
            
        if verbose:
            logging.info(f'iteration: {i}')
            #TODO: logfile --> iteration number, best fitness score, avg fitness score, hyper-params
        
        rand = np.random.random_sample(shape)
        offspr = get_offspring(pop, rand, lam, cr)
        next_generation(pop, offspr, func, pool)
        mutate(pop, rand)

    pool.terminate()
    return pop

In [11]:
def best_chromosome(population):
    #TODO: make sure it picks one with all k-clusters
    fits = np.argmax([fitness(chrom) for chrom in population])
    chrom = population[fits]
    return chrom
    

def central_sentences(chromosome, document, metric=cosine_distances):
    central_sents = []
    for cluster in np.unique(chromosome):
        idxs = np.where(chromosome == cluster)[0]
        sents = document[idxs]
        centroid = sents.mean(axis=0)[np.newaxis,:]
        dists = metric(sents, centroid)
        cent_sent = idxs[np.argmin(dists)]
        central_sents.append(cent_sent)
    return sorted(central_sents)

In [12]:
def rouge_n(n, y_pred, y_true):
    n_gram_pred = set(ngrams(y_pred, n))
    n_gram_true = set(ngrams(y_true, n))
    return len(n_gram_pred & n_gram_true) / len(n_gram_true)

# Run Algorithm

In [15]:
import pathlib

cwd = pathlib.Path.cwd()
path = cwd / 'data' / 'grinch.txt'
with open(path) as fp:
    text = fp.read()
    
logfile = cwd / f'{path.stem}.log'
logfile.touch()
fmt = '{name} - {asctime} - {levelname} : {message}'
logging.basicConfig(filename=logfile, level=logging.INFO, style='{', format=fmt)

text[:100]

'Every Who Down in Whoville Liked Christmas a lot...\nBut the Grinch,Who lived just north of Whoville,'

In [16]:
def fitness(chromosome):
    return cohesion_separation(chromosome, jaccard_similarity, doc)

cv = CountVectorizer(stop_words='english')
sents_lower = tokenize.sent_tokenize(text.lower())
sents_lower = (sent.split('\n') for sent in sents_lower)
sents_lower = tuple(itertools.chain.from_iterable(sents_lower))
vec = cv.fit_transform(sents_lower)
doc = vec.toarray().astype(bool).astype(int)
ratio = 0.05

logging.info('started iterations')
pop = run_iterations(pop_size=100, summ_len=ratio, num_sents=len(doc), 
                     func=fitness, lam=0.5, cr=0.5, iterations=1000, verbose=True, seed=0)

logging.info('finished iterations')

In [29]:
def retrieve_orig(idxs):
    summ_evol = []
    for sent in np.array(sents_lower)[idxs]:
        start = text.lower().index(sent)
        stop = start + len(sent)
        summ_evol.append(text[start:stop])
    summ_evol = '\n'.join(summ_evol)
    return summ_evol

In [30]:
chrom_best = best_chromosome(pop)
np.save(f'{path.stem}', chrom_best)
logging.info('saved npy')

pair_dist = functools.partial(pairwise_distances, metric='cosine')
idxs = central_sentences(chrom_best, doc, pair_dist)
summ_evol = retrieve_orig(idxs)
for i, sent in enumerate(summ_evol.split('\n'), start=0):
    print(i, sent)

0 The Grinch hated Christmas!
1 Then the Whos, young and old, would sit down to a feast.
2 And the more the Grinch thought of this Who ChristmasSing,
3 "I MUST stop this Christmas from coming!
4 THE GRINCH GOT A WONDERFUL, AWFUL IDEA!
5 Toward the homes where the Whos Lay asnooze in their town.
6 When he heard a small sound like the coo of a dove.
7 She stared at the Grinch and said, "Santy Claus, why,”
8 That the Grinch's small heart Grew three sizes that day!
9 And the minute his heart didn't feel quite so tight,


In [None]:
# [‘cityblock’, ‘cosine’, ‘euclidean’, ‘l1’, ‘l2’, ‘manhattan’,
# ‘braycurtis’, ‘canberra’, ‘chebyshev’, ‘correlation’, ‘dice’, ‘hamming’, ‘jaccard’, ‘kulsinski’, ‘mahalanobis’, ‘minkowski’, ‘rogerstanimoto’, ‘russellrao’, ‘seuclidean’, ‘sokalmichener’, ‘sokalsneath’, ‘sqeuclidean’, ‘yule’]

In [23]:
import gensim
gen_summ = gensim.summarization.summarize(text, ratio=ratio)
for i, sent in enumerate(gen_summ.split('\n'), start=1):
    print(i, sent)

1 Every Who Down in Whoville Liked Christmas a lot...
2 The Grinch hated Christmas!
3 He stood there on Christmas Eve, hating the Whos,
4 The more the Grinch thought, "I must stop this whole thing!"
5 THE GRINCH GOT A WONDERFUL, AWFUL IDEA!
6 Did that stop the old Grinch?
7 On a ramshackle sleigh And he hitched up old Max. Then the Grinch said, "Giddap!" And the sleigh started down,
8 He took the Whos' feast!
9 She stared at the Grinch and said, "Santy Claus, why,”


In [24]:
from sumy.parsers.plaintext import PlaintextParser
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.summarizers.text_rank import TextRankSummarizer
from sumy.nlp.tokenizers import Tokenizer

In [25]:
parser = PlaintextParser(text, Tokenizer('english'))

In [26]:
lex_rank = LexRankSummarizer()
summ = lex_rank(parser.document, 15)
print(summ)

(<Sentence: The Grinch hated Christmas!>, <Sentence: And then!>, <Sentence: Then the Whos, young and old, would sit down to a feast.>, <Sentence: Then the Grinch, very nimbly, Stuffed all the bags, one by one, up the chimney!>, <Sentence: But, you know, that old Grinch was so smart and so slick, He thought up a lie, and he thought it up quick!>, <Sentence: And the one speck of food That he left in the house, Was a crumb that was even too small for a mouse.>, <Sentence: "They're finding out now that no Christmas is coming!">, <Sentence: "They're just waking up!>, <Sentence: I know just what they'll do!">, <Sentence: "That's a noise," grinned the Grinch, "That I simply MUST hear!">, <Sentence: And the Grinch put his hand to his ear.>, <Sentence: It couldn't be so!>, <Sentence: But it WAS merry!>, <Sentence: And the Grinch, with his grinch-feet ice-cold in the snow, Stood puzzling and puzzling: "How could it be so?">, <Sentence: And the minute his heart didn't feel quite so tight, He whiz

In [27]:
text_rank = TextRankSummarizer()
summ = text_rank(parser.document, 15)
print(summ)

(<Sentence: Whatever the reason, His heart or his shoes, He stood there on Christmas Eve, hating the Whos, Staring down from his cave with a sour, Grinchy frown, At the warm lighted windows below in their town.>, <Sentence: And the more the Grinch thought of this Who ChristmasSing, The more the Grinch thought, "I must stop this whole thing!">, <Sentence: Then he took some red thread, And he tied a big horn on the top of his head.>, <Sentence: And the sleigh started down, Toward the homes where the Whos Lay asnooze in their town.>, <Sentence: "This is stop number one," the old Grinchy Claus hissed, And he climbed to the roof, empty bags in his fist.>, <Sentence: Then he slithered and slunk, with a smile most unpleasant, Around the whole room, and he took every present!>, <Sentence: Then the Grinch, very nimbly, Stuffed all the bags, one by one, up the chimney!>, <Sentence: And the Grinch grabbed the tree, and he started to shove, When he heard a small sound like the coo of a dove.>, <Se