In [46]:
import numpy as np
from nltk.tokenize import sent_tokenize
from sklearn.feature_extraction.text import CountVectorizer

def init_doc(text):
    """Convert raw text into a boolean sentence-by-term matrix.

    The text is split into sentences with ``sent_tokenize`` and every
    sentence is vectorized with ``CountVectorizer`` (English stop words
    removed). Counts are collapsed to booleans so that each row behaves like
    the set of terms present in that sentence.

    Parameters
    ----------
    text : object
        Document to process; coerced with ``str()``.

    Returns
    -------
    numpy.ndarray
        Dense boolean array with one row per sentence and one column per
        vocabulary term.
    """
    text = str(text)
    #: split on the original casing -- the sentence tokenizer uses
    #: capitalization as a boundary cue, and CountVectorizer lowercases the
    #: sentences by default, so lowering the text beforehand only hurts
    sentences = sent_tokenize(text)
    term_counts = CountVectorizer(stop_words='english').fit_transform(sentences)
    #: numba does not support sparse matrices; dtype bool to emulate sets
    doc = term_counts.toarray().astype(bool)
    return doc

def init_chrom(doc, k):
    """Build one random chromosome: a cluster label for every row of ``doc``.

    Each of the ``k`` labels ``0 .. k-1`` is guaranteed to occur at least
    once; all remaining positions receive a randomly drawn label.
    """
    n_sents = len(doc)
    #: -1 marks positions that have not been assigned a label yet
    labels = np.full(n_sents, -1)
    #: seed k distinct positions with one label apiece so no cluster is empty
    seeded = np.random.choice(np.arange(n_sents), k, replace=False)
    labels[seeded] = np.random.permutation(k)
    #: whatever still holds the sentinel gets a random label
    unset = labels == -1
    labels[unset] = np.random.choice(k, np.sum(unset))
    return labels

def init_pop(doc, n, k):
    """Create a population of ``n`` random chromosomes, one per row."""
    members = []
    for _ in range(n):
        members.append(init_chrom(doc, k))
    return np.array(members)

#: read the whole source text into memory
with open('cask_of_amontillado.txt') as fp:
    text = fp.read()

#: number of chromosomes in the population
size = 200
#: boolean sentence-by-term matrix for the text
doc = init_doc(text)
#: population of `size` chromosomes with 10 clusters each, stored as uint16
pop = init_pop(doc, size, 10).astype(np.uint16)
#: first chromosome of the population
chrom = pop[0]

In [53]:
import numpy as np
import numba as nb


@nb.njit(nb.float32(nb.boolean[::1], nb.boolean[::1]), fastmath=True)
def jaccard_old(a, b):
    """Jaccard similarity of two boolean membership vectors.

    Computes ``|a AND b| / |a OR b|``; when the union is empty the
    similarity is defined as 1.
    """
    n_union = np.sum(a | b)
    if n_union:
        return np.sum(a & b) / n_union
    return 1.0
        
        
# @nb.njit(nb.float32(nb.float32), fastmath=True)
@nb.njit(fastmath=True)
def sigmoid_old(x):
    """Logistic function: maps ``x`` to ``1 / (1 + exp(-x))``."""
    denom = 1 + np.exp(-x)
    return 1 / denom


# TODO: maybe pass a list of signatures, one per unsigned-integer chromosome dtype (uint8/16/32/64)?
@nb.njit(nb.float32(nb.uint16[::1], nb.boolean[:, ::1]), fastmath=True)
def cohesion_old(chrom, doc):
    """Score how tightly packed the clusters of one chromosome are.

    For every cluster, the Jaccard similarity of each unordered pair of its
    sentences is divided by the cluster size and added to the score.
    """
    score = 0
    for label in np.unique(chrom):
        members = doc[chrom == label]
        n_members = len(members)
        #: unordered pairs (first < second), i.e. combinations of two
        for first in range(n_members - 1):
            for second in range(first + 1, n_members):
                score += jaccard_old(members[first], members[second]) / n_members
    return score


@nb.njit(nb.float32(nb.uint16[::1], nb.boolean[:, ::1]), fastmath=True)
def separation_old(chrom, doc):
    """Measure of how separable all the clusters are.

    For every unordered pair of clusters, the mean Jaccard similarity between
    a sentence of one cluster and a sentence of the other is added to the
    total.

    Parameters
    ----------
    chrom : uint16[::1]
        Cluster label of every sentence (one entry per row of ``doc``).
    doc : boolean[:, ::1]
        Boolean sentence-by-term matrix.

    Returns
    -------
    float32
        Sum over cluster pairs of the mean cross-cluster similarity.
    """
    total = 0.0
    #: iterate over the labels that actually occur instead of assuming they
    #: are exactly 0..k-1; np.unique returns them sorted, so the visiting
    #: order is unchanged whenever the labels are contiguous
    labels = np.unique(chrom)
    k = len(labels)
    #: itertools.combinations(labels, r=2) for numba
    for p in range(k-1):
        #: rows of cluster p do not depend on q, so select them once
        sents_p = doc[chrom == labels[p]]
        m = len(sents_p)
        for q in range(p+1, k):
            sents_q = doc[chrom == labels[q]]
            n = len(sents_q)
            #: itertools.product(sents_p, sents_q) for numba
            for i in range(m):
                for j in range(n):
                    total += jaccard_old(sents_p[i], sents_q[j]) / m / n
    return total


@nb.jit(nb.float32(nb.uint16[::1], nb.boolean[:, ::1]), fastmath=True)
def cohesion_separation_old(chrom, doc):
    """Combined fitness of one chromosome: ``(1 + sigmoid(cohesion)) ** separation``."""
    compactness = cohesion_old(chrom, doc)
    apartness = separation_old(chrom, doc)
    base = 1 + sigmoid_old(compactness)
    return base ** apartness


In [48]:
#: shorthand alias for numba's prange
prange = nb.prange


#: target handed to the guvectorize decorators in this cell
target = 'parallel'


@nb.njit(nb.float32(nb.boolean[::1], nb.boolean[::1]), fastmath=True)
def jaccard(a, b):
    """Jaccard similarity of two boolean membership vectors.

    Size of the intersection divided by the size of the union; an empty
    union yields a similarity of 1.
    """
    either = np.sum(a | b)
    if either:
        return np.sum(a & b) / either
    return 1.0


@nb.njit(fastmath=True)
def sigmoid(x):
    """Sigmoid function defined as 1 / (1 + exp(-x)).

    Compiled lazily (no fixed signature) so that it accepts scalars as well
    as arrays: ``cohesion_separation`` passes it the result of the
    guvectorized ``cohesion``, which a scalar-only ``float32(float32)``
    signature would reject.
    """
    return 1 / (1 + np.exp(-x))


#: this cell used to bind ``sigmoid_old`` instead of ``sigmoid``; keep that
#: name available for any code that still refers to it
sigmoid_old = sigmoid



@nb.guvectorize(
    ['void(uint16[::1], boolean[:, ::1], float32[::1])'],
    '(n),(n,m)->()',
    target=target,
    fastmath=True,
)
def cohesion(chrom, doc, total):
    """Score how tightly packed the clusters of one chromosome are.

    The result is written to ``total[0]``: for every cluster, the Jaccard
    similarity of each unordered pair of its sentences divided by the
    cluster size.
    """
    total[0] = 0.0
    for label in np.unique(chrom):
        members = doc[chrom == label]
        n_members = len(members)
        #: unordered pairs (first < second), i.e. combinations of two
        for first in range(n_members - 1):
            for second in range(first + 1, n_members):
                total[0] += jaccard(members[first], members[second]) / n_members


@nb.guvectorize(['void(uint16[::1], boolean[:, ::1], float32[::1])'], '(n),(n,m)->()',
                target=target, fastmath=True)
def separation(chrom, doc, total):
    """Measure of how separable all the clusters are.

    The result is written to ``total[0]``: for every unordered pair of
    clusters, the mean Jaccard similarity between a sentence of one cluster
    and a sentence of the other.

    Parameters
    ----------
    chrom : uint16[::1]
        Cluster label of every sentence (one entry per row of ``doc``).
    doc : boolean[:, ::1]
        Boolean sentence-by-term matrix.
    total : float32[::1]
        Output slot; only ``total[0]`` is written.
    """
    total[0] = 0.0
    #: iterate over the labels that actually occur instead of assuming they
    #: are exactly 0..k-1; np.unique returns them sorted, so the visiting
    #: order is unchanged whenever the labels are contiguous
    labels = np.unique(chrom)
    k = len(labels)
    #: itertools.combinations(labels, r=2) for numba
    for p in range(k-1):
        #: rows of cluster p do not depend on q, so select them once
        sents_p = doc[chrom == labels[p]]
        m = len(sents_p)
        for q in range(p+1, k):
            sents_q = doc[chrom == labels[q]]
            n = len(sents_q)
            #: itertools.product(sents_p, sents_q) for numba
            for i in range(m):
                for j in range(n):
                    total[0] += jaccard(sents_p[i], sents_q[j]) / m / n
                    
################
# -- PRANGE -- #
################
                    

# @nb.jit#(nb.float32(nb.uint16[::1], nb.boolean[:, ::1]))
def cohesion_separation(chroms, doc):
    """Combined fitness: ``(1 + sigmoid(cohesion)) ** separation``."""
    compactness = cohesion(chroms, doc)
    apartness = separation(chroms, doc)
    base = 1 + sigmoid(compactness)
    return base ** apartness

In [49]:
from time import time
from functools import partial
import multiprocessing as mp

def timer(func):
    """Decorator that prints how long each call to ``func`` takes.

    Parameters
    ----------
    func : callable
        Function to wrap.

    Returns
    -------
    callable
        Wrapper that forwards all arguments to ``func``, prints the elapsed
        wall-clock time in seconds together with ``func``, and returns the
        result of ``func`` unchanged.
    """
    #: local import keeps this cell self-contained
    from functools import wraps

    #: wraps() carries func's name and docstring over to the wrapper
    @wraps(func)
    def wrapper(*args, **kwargs):
        t0 = time()
        res = func(*args, **kwargs)
        t1 = time()
        print(f'{t1 - t0:.6f} seconds -- {func}')
        return res
    return wrapper


def separation_pool(pair):
    """Call ``separation_old`` on ``pair`` with the module-level ``doc``.

    Single-argument wrapper, usable as the function given to ``Pool.map``.
    """
    #: an earlier variant unpacked ``pair`` as (chrom, doc):
#     return separation_old(pair[0], pair[1])
    return separation_old(pair, doc)


def cohesion_pool(pair):
    """Call ``cohesion_old`` on ``pair`` with the module-level ``doc``.

    Single-argument wrapper, usable as the function given to ``Pool.map``.
    """
    #: an earlier variant unpacked ``pair`` as (chrom, doc):
#     return cohesion_old(pair[0], pair[1])
    return cohesion_old(pair, doc)


def cohesion_separation_pool(pair):
    """Call ``cohesion_separation_old`` on ``pair`` with the module-level ``doc``.

    Single-argument wrapper, usable as the function given to ``Pool.map``.
    """
    #: an earlier variant unpacked ``pair`` as (chrom, doc):
#     return cohesion_separation_old(pair[0], pair[1])
    return cohesion_separation_old(pair, doc)


def test_fitness_func(name):
    """Benchmark and cross-check the three variants of a fitness function.

    Looks up ``name``, ``name + '_old'`` and ``name + '_pool'`` in the module
    globals, times each of them over the module-level ``pop`` and ``doc``,
    and prints whether all three produced matching scores.

    Parameters
    ----------
    name : str
        Base name of the fitness function, e.g. ``'cohesion'``.

    Raises
    ------
    KeyError
        If any of the three variants is not defined at module level.
    """
    func = globals()[name]
    func_old = globals()[name+'_old']
    func_pool = globals()[name+'_pool']

    def run_timed(label, compute):
        #: time one variant and report it in the shared output format
        t0 = time()
        result = compute()
        t1 = time()
        print(f'{t1 - t0:.6f} seconds -- {label}')
        return result

    def run_pool():
        #: pool start-up and teardown are included in the measured time
        with mp.Pool() as pool:
            return pool.map(func_pool, pop)

    #: whole population in a single call
    score = run_timed(func, lambda: func(pop, doc))
    #: one call per chromosome
    score_old = run_timed(func_old, lambda: [func_old(chrom, doc) for chrom in pop])
    #: one call per chromosome, spread over worker processes
    score_pool = run_timed(func_pool, run_pool)

    #: all three variants must agree, including the pool result
    print(np.allclose(score, score_old) and np.allclose(score, score_pool))

In [50]:
#: time and cross-check cohesion, cohesion_old and cohesion_pool
test_fitness_func('cohesion')

0.211677 seconds -- <ufunc 'cohesion'>
0.426388 seconds -- CPUDispatcher(<function cohesion_old at 0x115f859d8>)
0.229431 seconds -- <function cohesion_pool at 0x1174efe18>
True


In [51]:
#: time and cross-check separation, separation_old and separation_pool
test_fitness_func('separation')

1.960151 seconds -- <ufunc 'separation'>
3.682992 seconds -- CPUDispatcher(<function separation_old at 0x11b554158>)
1.769362 seconds -- <function separation_pool at 0x1174efbf8>
True


In [52]:
#: time and cross-check the combined fitness in all three variants
test_fitness_func('cohesion_separation')

2.024051 seconds -- <function cohesion_separation at 0x11c779d90>
4.056772 seconds -- CPUDispatcher(<function cohesion_separation_old at 0x11c4528c8>)
1.979354 seconds -- <function cohesion_separation_pool at 0x11b1b27b8>
True


In [21]:
#: manual wall-clock timing of cohesion_separation over the whole population
t0 = time()
cohesion_separation(pop, doc)
#: elapsed seconds, shown as the cell output
time() - t0

2.036125898361206

In [22]:
#: the pool is created before the clock starts, so its start-up is not timed
pool = mp.Pool() 

t0 = time()
#: leaving the with-block also shuts the pool down, which is part of the timing
with pool as p:
    score_pool = p.map(cohesion_separation_pool, pop)
#: elapsed seconds, shown as the cell output
time() - t0

1.9543039798736572