In [1]:
import itertools
import json
import pathlib

import numba
import numpy as np
from nltk import tokenize
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Locate the 2018 article dump relative to the directory the notebook runs in.
cwd = pathlib.Path.cwd()
data = cwd / 'data'
jsons = data / 'jsons'
json_2018 = jsons / '2018' / '2018.json'

# Name the encoding explicitly so the read does not depend on the platform's
# locale default (JSON interchange files are UTF-8).
with open(json_2018, encoding='utf-8') as fp:
    articles_2018 = json.load(fp)['2018']

# Pick one article and keep its 'story' field as the working text.
article = articles_2018[4]
text = article['story']

In [3]:
def jaccard_sim(a, b):
    """Jaccard similarity of two 0/1 integer (or boolean) vectors.

    Computes ``sum(a & b) / sum(a | b)``.

    Returns 0.0 when the union is empty (neither vector has a set entry)
    instead of evaluating 0/0, which would yield NaN and propagate into any
    total accumulated from this value.
    """
    union = np.sum(a | b)
    if union == 0:
        # Two empty vectors share nothing; treat them as completely dissimilar.
        return 0.0
    return np.sum(a & b) / union


def sigmoid(x):
    """Logistic function ``1 / (1 + exp(-x))``, evaluated through ``np.exp``."""
    decay = np.exp(-x)
    return 1 / (decay + 1)


def cohesion(chromosome, sim, doc):
    """Accumulate within-cluster pairwise similarity.

    ``chromosome`` holds one cluster label per position; the rows of ``doc``
    at the positions carrying a given label form that cluster. For every
    cluster, each unordered pair of its rows contributes
    ``sim(row_i, row_j) / cluster_size`` to the returned sum.
    """
    score = 0
    for label in np.unique(chromosome):
        members = doc[np.where(chromosome == label)]
        size = len(members)
        for left, right in itertools.combinations(members, 2):
            score += sim(left, right) / size
    return score


def separation(chromosome, sim, doc):
    """Accumulate between-cluster pairwise similarity.

    For every unordered pair of distinct labels found in ``chromosome``, each
    (row of the first cluster, row of the second cluster) combination
    contributes ``sim(row_i, row_j) / size_first / size_second`` to the
    returned sum.
    """
    score = 0
    labels = np.unique(chromosome)
    for first, second in itertools.combinations(labels, 2):
        group_a = doc[np.where(chromosome == first)]
        group_b = doc[np.where(chromosome == second)]
        size_a, size_b = len(group_a), len(group_b)
        for row_a, row_b in itertools.product(group_a, group_b):
            score += sim(row_a, row_b) / size_a / size_b
    return score

def cohesion_separation(chromosome, sim, doc):
    """Combine both terms as ``(1 + sigmoid(cohesion)) ** separation``."""
    within = cohesion(chromosome, sim, doc)
    between = separation(chromosome, sim, doc)
    base = 1 + sigmoid(within)
    return base ** between

In [22]:
@numba.njit(cache=True)
def jaccard_sim3(a, b):
    """Jaccard similarity of two 0/1 integer vectors (njit-compiled, cached).

    Computes ``sum(a & b) / sum(a | b)`` and returns 0.0 when the union is
    empty, so a pair of all-zero vectors never triggers a division by zero.
    """
    union = np.sum(a | b)
    #: the union is empty when neither vector has a set entry; such a pair
    #: shares nothing, so report zero similarity instead of dividing by zero
    if union == 0:
        return 0.0
    return np.sum(a & b) / union

In [33]:
@numba.njit(cache=True)
def cohesion3(chromosome, doc):
    """Within-cluster similarity sum using ``jaccard_sim3`` directly.

    For each distinct label in ``chromosome`` the rows of ``doc`` at the
    matching positions form a cluster; every unordered pair of rows in the
    cluster adds ``jaccard_sim3(row_i, row_j) / cluster_size`` to the result.
    """
    score = 0
    for label in np.unique(chromosome):
        members = doc[np.where(chromosome == label)]
        size = len(members)
        #: every unordered pair (first < second) inside the cluster
        for first in range(size - 1):
            for second in range(first + 1, size):
                score += jaccard_sim3(members[first], members[second]) / size
    return score

In [35]:
# Time the cohesion kernel that calls jaccard_sim3 directly.
# NOTE(review): `chrom` and `doc` are assigned in a cell that appears further
# down in this notebook; on a fresh kernel that cell has to run first.
%timeit cohesion3(chrom, doc)

7.02 µs ± 56.4 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [36]:
# Comparison run: the cohesion variant that receives the similarity function
# as an argument.
# NOTE(review): cohesion2 and jaccard_sim2 are defined in a cell that appears
# further down in this notebook.
%timeit cohesion2(chrom, jaccard_sim2, doc)

16.3 µs ± 102 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [37]:
@numba.njit(cache=True)
def separation3(chromosome, doc):
    """Between-cluster similarity sum using ``jaccard_sim3`` directly.

    For every unordered pair of distinct labels present in ``chromosome``,
    each (row of the first cluster, row of the second cluster) combination
    adds ``jaccard_sim3(row_i, row_j) / m / n`` to the result, where ``m`` and
    ``n`` are the two cluster sizes.

    The labels are the values actually found in ``chromosome`` (in sorted
    order), so they do not have to be the contiguous range ``0..k-1``.
    """
    total = 0.0
    labels = np.unique(chromosome)
    k = len(labels)
    #: combinations choose 2, over the labels that really occur
    for a in range(k - 1):
        #: rows of the first cluster do not depend on the inner loop
        sents_p = doc[np.where(chromosome == labels[a])]
        m = len(sents_p)
        for b in range(a + 1, k):
            sents_q = doc[np.where(chromosome == labels[b])]
            n = len(sents_q)
            #: product
            for i in range(m):
                for j in range(n):
                    total += jaccard_sim3(sents_p[i], sents_q[j]) / m / n
    return total

In [42]:
# Time the separation kernel that calls jaccard_sim3 directly.
# NOTE(review): `chrom` and `doc` are assigned in a cell that appears further
# down in this notebook; on a fresh kernel that cell has to run first.
%timeit separation3(chrom, doc)

22.8 µs ± 205 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [40]:
# Comparison run: the separation variant that receives the similarity
# function as an argument.
# NOTE(review): separation2 and jaccard_sim2 are defined in a cell that
# appears further down in this notebook.
%timeit separation2(chrom, jaccard_sim2, doc)

39.8 µs ± 15.6 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [21]:
@numba.njit
def jaccard_sim2(a, b):
    """Jaccard similarity of two 0/1 integer vectors (njit-compiled).

    Computes ``sum(a & b) / sum(a | b)`` and returns 0.0 when the union is
    empty, so a pair of all-zero vectors never triggers a division by zero.
    """
    union = np.sum(a | b)
    #: the union is empty when neither vector has a set entry; such a pair
    #: shares nothing, so report zero similarity instead of dividing by zero
    if union == 0:
        return 0.0
    return np.sum(a & b) / union


@numba.njit
def sigmoid2(x):
    """Logistic function ``1 / (1 + exp(-x))`` (njit-compiled)."""
    decay = np.exp(-x)
    return 1 / (decay + 1)


@numba.njit
def cohesion2(chromosome, sim, doc):
    """Within-cluster similarity sum with a caller-supplied ``sim`` function.

    For each distinct label in ``chromosome`` the rows of ``doc`` at the
    matching positions form a cluster; every unordered pair of rows in the
    cluster adds ``sim(row_i, row_j) / cluster_size`` to the result.
    """
    score = 0
    for label in np.unique(chromosome):
        members = doc[np.where(chromosome == label)]
        size = len(members)
        #: every unordered pair (first < second) inside the cluster
        for first in range(size - 1):
            for second in range(first + 1, size):
                score += sim(members[first], members[second]) / size
    return score


@numba.njit
def separation2(chromosome, sim, doc):
    """Between-cluster similarity sum with a caller-supplied ``sim`` function.

    For every unordered pair of distinct labels present in ``chromosome``,
    each (row of the first cluster, row of the second cluster) combination
    adds ``sim(row_i, row_j) / m / n`` to the result, where ``m`` and ``n``
    are the two cluster sizes.

    The labels are the values actually found in ``chromosome`` (in sorted
    order), so they do not have to be the contiguous range ``0..k-1``.
    """
    total = 0.0
    labels = np.unique(chromosome)
    k = len(labels)
    #: combinations choose 2, over the labels that really occur
    for a in range(k - 1):
        #: rows of the first cluster do not depend on the inner loop
        sents_p = doc[np.where(chromosome == labels[a])]
        m = len(sents_p)
        for b in range(a + 1, k):
            sents_q = doc[np.where(chromosome == labels[b])]
            n = len(sents_q)
            #: product
            for i in range(m):
                for j in range(n):
                    total += sim(sents_p[i], sents_q[j]) / m / n
    return total


@numba.njit
def cohesion_separation2(chromosome, sim, doc):
    """Combine both terms as ``(1 + sigmoid2(cohesion2)) ** separation2``."""
    within = cohesion2(chromosome, sim, doc)
    between = separation2(chromosome, sim, doc)
    base = 1 + sigmoid2(within)
    return base ** between

In [9]:
# Split the lower-cased article into sentences and build a term-count matrix
# with English stop words removed.
cv = CountVectorizer(stop_words='english')
sents = tokenize.sent_tokenize(text.lower())
vec = cv.fit_transform(sents)

# Collapse counts to 0/1 presence flags, kept as ints so the bitwise & and |
# in the Jaccard helpers apply.
doc = vec.toarray().astype(bool).astype(int)
# Hand-written cluster assignment used as a fixed test input.
# NOTE(review): 8 labels — presumably one per sentence of this article;
# confirm that len(chrom) == len(doc).
chrom = np.array([0, 2, 1, 1, 3, 2, 4, 1])

# The compiled and the reference implementations must agree. Compare with a
# floating-point tolerance: exp/pow may differ in the last bit between numba
# and numpy, which would make an exact == check fail spuriously.
np.testing.assert_allclose(
    cohesion2(chrom, jaccard_sim2, doc),
    cohesion(chrom, jaccard_sim, doc),
    equal_nan=False,
)
np.testing.assert_allclose(
    separation2(chrom, jaccard_sim2, doc),
    separation(chrom, jaccard_sim, doc),
    equal_nan=False,
)
np.testing.assert_allclose(
    cohesion_separation2(chrom, jaccard_sim2, doc),
    cohesion_separation(chrom, jaccard_sim, doc),
    equal_nan=False,
)

In [15]:
# End-to-end timing of the njit-compiled fitness function.
%timeit cohesion_separation2(chrom, jaccard_sim2, doc)

41.5 µs ± 1.04 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [16]:
# End-to-end timing of the undecorated (interpreted) fitness function, for
# comparison with the njit-compiled one.
%timeit cohesion_separation(chrom, jaccard_sim, doc)

436 µs ± 12.1 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [9]:
# Timing of the compiled cohesion term on its own.
%timeit cohesion2(chrom, jaccard_sim2, doc)

16.9 µs ± 252 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [41]:
# Timing of the compiled separation term on its own.
%timeit separation2(chrom, jaccard_sim2, doc)

32.4 µs ± 280 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [21]:
# Cost of np.unique by itself, called from interpreted Python.
%timeit np.unique(chrom)

4.61 µs ± 33.3 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [24]:
# Cost of pulling out the rows of a single cluster (label 4) via np.where.
%timeit doc[np.where(chrom == 4)]

2.63 µs ± 39.1 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [50]:
# NOTE(review): np.nonzero(chrom) selects the rows whose label is non-zero,
# which is a different selection from a `chrom == 4` mask — confirm that this
# is the comparison that was intended.
%timeit doc[np.nonzero(chrom)]

2.7 µs ± 50.8 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [44]:
@numba.njit
def foo(chrom, doc):
    """Benchmark scaffold: extract each cluster's rows and discard them.

    Does no similarity work and returns nothing.
    """
    labels = np.unique(chrom)
    for idx in range(len(labels)):
        sents = doc[np.where(chrom == labels[idx])]

In [46]:
# Time only the per-cluster row extraction inside njit (no similarity work).
%timeit foo(chrom, doc)

5.4 µs ± 72.4 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [47]:
@numba.njit
def bar(chromosome, doc):
    """Benchmark scaffold: extract the rows of every unordered cluster pair.

    Does no similarity work and returns nothing. The labels are the values
    actually present in ``chromosome``, so they need not be ``0..k-1``.
    """
    labels = np.unique(chromosome)
    k = len(labels)
    #: combinations choose 2, over the labels that really occur
    for a in range(k - 1):
        for b in range(a + 1, k):
            sents_p = doc[np.where(chromosome == labels[a])]
            sents_q = doc[np.where(chromosome == labels[b])]

In [49]:
# Time only the cluster-pair row extraction inside njit (no similarity work).
%timeit bar(chrom, doc)

13.4 µs ± 52.7 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
