In [190]:
import collections
import functools
import itertools
import logging
import multiprocessing
import pathlib

from nltk.tokenize import sent_tokenize
from numba import njit
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import pairwise_distances


@njit
def jaccard(a, b):
    union = np.sum(a | b)
    if not union:
        return 1.0
    return np.sum(a & b) / union


@njit
def sigmoid(x):
    return 1 / (1 + np.exp(-x))


class DdeSummarizer:
    """Discrete differential evolution (DDE) extractive text summarizer.
        
    This implementation is derived from:
    [1] R. Alguliev, R. Aliguliyev "Evolutionary Algorithm for Extractive Text Summarization" 2009
    [2] S. Das, P.N. Suganthan "Differential Evolution: A Survey of the State-of-the-Art" 2011
    [3] A. Abuobieda, N. Salim, Y.J. Kumar, A.H. Osman "An Improved Evolutionary Algorithm for Extractive Text Summarization" 2013
    [4] S. Karwa, N. Chatterjee "Discrete Differential Evolution for Text Summarization" 2014

    Parameters
    ----------
    pop_size : int, (default=100)
        The population size to create offspring and mutate over.

    max_iter : int, (default=1000)
        The maximum number of generations to evolve.

    summ_ratio : float, (default=0.1)
        The compression ratio for the summary, with 0 <= summ_ratio <= 1.

    lam : float, (default=0.5)
        The scale factor used with DDE, with 0 <= lam <= 1.

    crossover : float, (default=0.5)
        The crossover rate used for offspring, with 0 <= summ_ratio <= 1.

    fitness : str, 'coh_sep', 'coh', or 'sep', (default='coh_sep')
        The fitness function used to determine which chromosomes make it to the
        next generation. The 'coh' fitness maximizes similarity within a given
        cluster. The 'sep' fitness minimizes similarity between different
        clusters. The 'coh_sep' is a balance of the former two.

    similarity : callable, (default=jaccard)
        Similarity function for comparing two arrays. Needs to be able to work
        with numba.

    metric : str, or callable, (default='cosine')
        Metric used to select central sentences from clusters when finished with
        iterating through generations.
        See sklearn.metrics.pairwise_distances for details.

    tokenizer : callable, (default=nltk.tokenize.sent_tokenize)
        Tokenizer used to split text when fit.

    stop_words : str, list, or None (default=None):
        Words to remove from document.
        See sklearn.feature_extraction.text.CountVectorizer for details.

    n_jobs : int, (default=1)
        The number of CPUs to score fitness of each chromosome in the population
        at each generation. -1 means usings all processors.

    early_stopping : bool, (default=False)
        Whether to use early stopping to terminate iterations when fitness score
        is not improving.

    n_iter_no_change : int, (default=5)
        Number of iterations with no improvement to wait before early stopping.

    tol : float, (default=1e-3)
        The stopping criterion.

    random_state : int, (default=None)
        The seed of the pseudo random number generator to use when evolving To
        be passed to np.random.seed.

    verbose : int, (default=0)
        The verbosity level.
    """

    def __init__(self, pop_size=100, max_iter=1000, summ_ratio=0.1, lam=0.5, crossover=0.5,
                 fitness='coh_sep', similarity=jaccard, metric='cosine', tokenizer=sent_tokenize,
                 stop_words=None, n_jobs=1, early_stopping=False, n_iter_no_change=5,
                 tol=1e-3, random_state=None, verbose=0):

        self.pop_size = int(pop_size)
        self.max_iter = int(max_iter)
        self.n_jobs = int(n_jobs)
        self.early_stopping = bool(early_stopping)
        self.n_iter_no_change = int(n_iter_no_change) if (n_iter_no_change > 1) else 1
        self.tol = float(tol)
        self.random_state = random_state
        self.verbose = max(0, int(verbose))
        self.stop_words = stop_words
        self.tokenizer = tokenizer
        self.metric = str(metric).lower()

        self._pop = None
        self._offspr = None
        self._rand = None

        funcs = dict(coh_sep=self._cohesion_separation, coh=self._cohesion, sep=self._separation)
        self.fitness = funcs[fitness.lower()]

        if (summ_ratio < 0) or (summ_ratio > 1):
            raise ValueError('summ_ratio not in interval [0, 1]')
        self.summ_ratio = float(summ_ratio)

        if (lam < 0) or (lam > 1):
            raise ValueError('lam not in interval [0, 1]')
        self.lam = float(lam)

        if (crossover < 0) or (crossover > 1):
            raise ValueError('crossover not in interval [0, 1]')
        self.crossover = float(crossover)

        #TODO: use inspect.signature to see if it takes 2 inputs?
        if not callable(similarity):
            raise ValueError('similarity must be callable')
        self.similarity = similarity

    def __repr__(self):
        fitness = '_'.join(fit[:3] for fit in self.fitness.__name__.split('_') if fit)
        return (f'{type(self).__name__}(pop_size={self.pop_size}, max_iter={self.max_iter}, '
                f'summ_ratio={self.summ_ratio}, lam={self.lam}, crossover={self.crossover}, '
                f'fitness={fitness!r}, similarity={self.similarity.__name__}, metric={self.metric!r}, '
                f'tokenizer={self.tokenizer.__name__}, stop_words={self.stop_words!r}, n_jobs={self.n_jobs}, '
                f'early_stopping={self.early_stopping}, n_iter_no_change={self.n_iter_no_change}, '
                f'tol={self.tol}, random_state={self.random_state}, verbose={self.verbose})')

    def fit(self, text):
        """Fit text to model."""
        self.text = str(text)
        self._tokens = self.tokenizer(self.text.lower())
        count_vec = CountVectorizer(stop_words=self.stop_words).fit_transform(self._tokens)
        #: numba does not support sparse matrices
        self._document = count_vec.toarray().astype(bool)
        self._summ_len = int(self.summ_ratio * self._document.shape[1]) or 1        

    def summarize(self):
        """Create extractive summary using DDE."""
        np.random.seed(self.random_state)

        if self.verbose:
            logging.debug(repr(self))
            logging.info(self.text)
            logging.debug('random seed: {}'.format(self.random_state))
            if self.verbose >= 2:
                logging.info('random state: {}'.format(np.random.get_state()))

        processes = self.n_jobs if (self.n_jobs >= 1) else None
        pool = multiprocessing.Pool(processes)
        n_iter_deque = collections.deque([np.nan] * self.n_iter_no_change, maxlen=self.n_iter_no_change)
        self._pop = np.array([self._init_chrom() for _ in range(self.pop_size)])
        
        for i in range(self.max_iter):
            self._rand = np.random.random_sample(self._pop.shape)
            self._offspring()
            self._survival(pool)
            self._mutate()
            
            if self.verbose:
                logging.debug('iteration: {}'.format(i))
                if self.verbose >= 2:
                    logging.debug('best fit: {}'.format(self._best_fit))

            if self.early_stopping:
                n_iter_deque.append(self._best_fit)
                if max(n_iter_deque) - min(n_iter_deque) < self.tol:
                    break

        pool.terminate()
        self.n_iter_ = i + 1
        idx = np.argmax([self.fitness(chrom) for chrom in self._pop])
        self.best_chrom_ = self._pop[idx]
        self._build_summ()

    def _init_chrom(self):
        clusters = np.arange(self._summ_len)
        chrom = np.full(len(self._document), -1)
        #: ensure that each cluster is accounted for at least once
        idxs = np.random.choice(np.arange(len(chrom)), self._summ_len, replace=False)
        chrom[idxs] = np.random.permutation(clusters)
        #: fill rest randomly
        idxs = (chrom == -1)
        chrom[idxs] = np.random.choice(clusters, np.sum(idxs))
        return chrom

    def _offspring(self):
        n = np.arange(len(self._pop))
        s = frozenset(n)
        #: get 3 distinct chromosomes that differ from i_th chromosome
        idxs = np.array([np.random.choice(tuple(s - {i}), size=3, replace=False) for i in n])
        chrom_1, chrom_2, chrom_3 = map(np.squeeze, np.split(self._pop[idxs], 3, axis=1))
        #: discrete differential evolution
        self._offspr = (chrom_1 + self.lam * (chrom_2 - chrom_3)) % self._summ_len
        mask = self._rand < self.crossover
        self._offspr[mask] = self._pop[mask]
        return

    def _survival(self, pool):
        fits = pool.map(self.fitness, itertools.chain(self._pop, self._offspr))
        self._best_fit = max(fits)  # used for early stopping
        i = len(self._pop)
        fit_pop, fit_off = fits[:i], fits[i:]
        mask = fit_off > fit_pop
        self._pop[mask] = self._offspr[mask]
        return

    def _mutate(self):
        mask = self._rand < sigmoid(self._pop)
        #: inversion operator -> for each row reverse order of all True values
        idxs = np.nonzero(mask)
        arr = np.array(idxs)
        sorter = np.lexsort((-arr[1], arr[0]))
        rev = arr.T[sorter].T
        self._pop[idxs] = self._pop[(rev[0], rev[1])]
        return

    def _central_sents(self):
        central_idxs = []
        for cluster in np.unique(self.best_chrom_):
            idxs = np.where(self.best_chrom_ == cluster)[0]
            sents = self._document[idxs]
            centroid = sents.mean(axis=0)[np.newaxis,:]
            dists = pairwise_distances(sents, centroid, self.metric)
            cent_sent = idxs[np.argmin(dists)]
            central_idxs.append(cent_sent)
        return sorted(central_idxs)

    def _build_summ(self):
        central = self._central_sents()
        summ = []
        for sent in np.array(self._tokens)[central]:
            start = self.text.lower().index(sent)
            stop = start + len(sent)
            summ.append(self.text[start:stop])
        self.summary_ = '\n'.join(summ)
        return
    
    def _cohesion(self, chrom):
        return _cohesion(chrom, self._document, self.similarity)
    
    def _separation(self, chrom):
        return 1 / _separation(chrom, self._document, self.similarity)
    
    def _cohesion_separation(self, chrom):
        return _cohesion_separation(chrom, self._document, self.similarity)


#: numba doesn't work in class
@njit
def _cohesion(chrom, doc, sim):
    total = 0
    for p in np.unique(chrom):
        sents = doc[chrom == p]
        k = len(sents)
        #: itertools.combinations(sents, r=2)
        for i in range(k-1):
            for j in range(i+1, k):
                total += sim(sents[i], sents[j]) / len(sents)
    return total

@njit
def _separation(chrom, doc, sim):
    total = 0
    k = len(np.unique(chrom))
    #: itertools.combinations(k, r=2)
    for p in range(k-1):
        for q in range(p+1, k):
            sents_p = doc[chrom == p]
            sents_q = doc[chrom == q]
            #: itertools.product(sents_p, sents_q)
            m, n = len(sents_p), len(sents_q)
            for i in range(m):
                for j in range(n):
                    total += sim(sents_p[i], sents_q[j]) / m / n
    return total

@njit
def _cohesion_separation(chrom, doc, sim):
    coh = _cohesion(chrom, doc, sim)
    sep = _separation(chrom, doc, sim)
    return (1 + sigmoid(coh)) ** sep

In [192]:
cwd = pathlib.Path.cwd()
data = cwd / 'data'
txts = data / 'txts'

tell_tale_heart_txt = txts / 'tell_tale_heart.txt'
with open(tell_tale_heart_txt) as fp:
    text = fp.read()

text[:100]

'True! --nervous --very, very dreadfully nervous I had been and am; but why will you say that I am ma'

In [198]:
dde_summ = DdeSummarizer(pop_size=50, max_iter=500, summ_ratio=0.05, stop_words='english', 
                         n_jobs=-1, random_state=0, early_stopping=True)
dde_summ.fit(text)
dde_summ.summarize()

In [199]:
print(dde_summ.summary_)

Whenever it fell upon me, my blood ran cold; and so by degrees --very gradually --I made up my mind to take the life of the old man, and thus rid myself of the eye forever.
I was never kinder to the old man than during the whole week before I killed him.
And then, when I had made an opening sufficient for my head, I put in a dark lantern, all closed, closed, that no light shone out, and then I thrust in my head.
It took me an hour to place my whole head within the opening so far that I could see him as he lay upon his bed.
would a madman have been so wise as this, And then, when my head was well in the room, I undid the lantern cautiously-oh, so cautiously --cautiously (for the hinges creaked) --I undid it just so much that a single thin ray fell upon the vulture eye.
And every morning, when the day broke, I went boldly into the chamber, and spoke courageously to him, calling him by name in a hearty tone, and inquiring how he has passed the night.
His room was as black as pitch with th

In [200]:
dde_summ.n_iter_

500

In [201]:
dde_summ.best_chrom_

array([22,  6, 18, 12, 11,  4,  9, 11, 11,  9, 15,  1, 21, 22, 16,  1,  7,
        6,  1,  3, 12, 18,  3,  0, 18, 16, 17, 23,  1,  5, 19, 14, 13, 14,
        2,  3, 13, 16, 23,  2, 20,  2,  7, 11,  8, 23,  6, 13, 11, 13, 19,
        9,  9,  9, 15, 14, 17,  0, 11, 20,  8,  6,  1,  4, 20,  9,  7,  3,
       10, 18,  9,  4, 20,  1, 12,  6,  0, 18,  9, 17, 14,  1,  8, 16,  8,
       10, 20,  6,  2, 12, 12,  4, 17,  5, 11,  1, 17,  4,  9,  9,  3, 19,
       21,  7,  0, 12,  9,  6,  1, 20,  3,  1, 16,  4,  5,  6, 15, 21, 16,
       23, 23, 21,  0, 10, 20, 20,  2, 22, 22, 11, 10,  7,  2, 10, 17, 13,
       17, 22,  8, 10, 21, 13,  4,  0,  7, 19,  6,  9,  0, 16,  8, 23, 19,
        8, 21, 11, 15, 19,  7,  4,  5,  0, 12,  6, 21,  4, 21, 21])