In [1]:
import itertools
import json
import pathlib

import numba
import numpy as np
from nltk import tokenize
from sklearn.feature_extraction.text import CountVectorizer

In [22]:
cwd = pathlib.Path.cwd()
data = cwd / 'data'
jsons = data / 'jsons'
json_2018 = jsons / '2018' / '2018.json'

with open(json_2018) as fp:
    articles_2018 = json.load(fp)['2018']

article = articles_2018[4]
text = article['story']

cv = CountVectorizer(stop_words='english')
sents = tokenize.sent_tokenize(text.lower())
vec = cv.fit_transform(sents)

doc = vec.toarray().astype(bool).astype(int)
chrom = np.array([0, 2, 1, 1, 3, 2, 4, 1])

chrom = np.random.randint(0, 5, size=len(doc))
chrom

array([1, 2, 0, 1, 0, 4, 4, 0, 2, 1, 2, 2, 0, 0, 1, 0, 1, 0, 4, 1, 1, 0,
       2, 0, 1, 3, 4, 0, 4, 3, 1, 3, 3, 4, 0, 0, 1, 3])

# Original Ver

In [9]:
def jaccard_sim(a, b):
    return np.sum(a & b) / np.sum(a | b)


def sigmoid(x):
    return 1 / (1 + np.exp(-x))


def cohesion(chromosome, sim, doc):
    total = 0
    for p in np.unique(chromosome):
        cluster_p = doc[np.where(chromosome == p)]
        for sent_i, sent_j in itertools.combinations(cluster_p, r=2):
            total += sim(sent_i, sent_j) / len(cluster_p)
    return total


def separation(chromosome, sim, doc):
    total = 0
    for p, q in itertools.combinations(np.unique(chromosome), r=2):
        cluster_p = doc[np.where(chromosome == p)]
        cluster_q = doc[np.where(chromosome == q)]
        for sent_i, sent_j in itertools.product(cluster_p, cluster_q):
            total += sim(sent_i, sent_j) / len(cluster_p) / len(cluster_q)
    return total

def cohesion_separation(chromosome, sim, doc):
    coh = cohesion(chromosome, sim, doc)
    sep = separation(chromosome, sim, doc)
    return pow(1 + sigmoid(coh), sep)

In [10]:
@numba.njit(cache=True)
def jaccard_sim3(a, b):
    #: assume union is non-empty since each sentence >= 1 word
    return np.sum(a & b) / np.sum(a | b)

# Find Bottlenecks

In [65]:
@numba.njit
def jacc(a, b):
    return np.sum(a & b) / np.sum(a | b)


@numba.njit
def separation0(chromosome, sim, doc):
    total = 0
    k = len(np.unique(chromosome))
    for p in range(k-1):
        for q in range(p+1, k):
            pass
    return total


@numba.njit
def separation1(chromosome, sim, doc):
    total = 0
    k = len(np.unique(chromosome))
    for p in range(k-1):
        for q in range(p+1, k):
            sents_p = doc[np.where(chromosome == p)]
            sents_q = doc[np.where(chromosome == q)]
    return total


@numba.njit
def separation2(chromosome, sim, doc):
    total = 0
    k = len(np.unique(chromosome))
    for p in range(k-1):
        for q in range(p+1, k):
            sents_p = doc[np.where(chromosome == p)]
            sents_q = doc[np.where(chromosome == q)]
            m, n = len(sents_p), len(sents_q)
            for i in range(m):
                for j in range(n):
                    pass
    return total


@numba.njit
def separation3(chromosome, sim, doc):
    total = 0
    k = len(np.unique(chromosome))
    for p in range(k-1):
        for q in range(p+1, k):
            sents_p = doc[np.where(chromosome == p)]
            sents_q = doc[np.where(chromosome == q)]
            m, n = len(sents_p), len(sents_q)
            for i in range(m):
                for j in range(n):
                    total += sim(sents_p[i], sents_q[j]) / m / n
    return total



# doesn't seem to help consistently
@numba.njit(cache=True)
def get_sentences(doc, chrom, k):
    return doc[chrom == k]


@numba.njit
def separation4(chromosome, sim, doc):
    total = 0
    k = len(np.unique(chromosome))
    for p in range(k-1):
        for q in range(p+1, k):
            sents_p = get_sentences(doc, chrom, p)
            sents_q = get_sentences(doc, chrom, q)
            m, n = len(sents_p), len(sents_q)
            for i in range(m):
                for j in range(n):
                    total += sim(sents_p[i], sents_q[j]) / m / n
    return total


@numba.njit
def separation5(chromosome, sim, doc):
    total = 0
    k = len(np.unique(chromosome))
    for p in range(k-1):
        for q in range(p+1, k):
            sents_p = doc[chromosome == p]
            sents_q = doc[chromosome == q]
            m, n = len(sents_p), len(sents_q)
            for i in range(m):
                for j in range(n):
                    total += sim(sents_p[i], sents_q[j]) / m / n
    return total


@numba.njit
def separation6(chromosome, sim, doc, k):
    total = 0
    for p in range(k-1):
        for q in range(p+1, k):
            sents_p = doc[chromosome == p]
            sents_q = doc[chromosome == q]
            m, n = len(sents_p), len(sents_q)
            print(m, n)
            for i in range(m):
                for j in range(n):
                    total += sim(sents_p[i], sents_q[j]) / m / n
    return total

In [38]:
%timeit separation0(chrom, jacc, doc)

12.2 µs ± 366 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [39]:
%timeit separation1(chrom, jacc, doc)

43.7 µs ± 15 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [40]:
%timeit separation2(chrom, jacc, doc)

40.1 µs ± 16.8 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [46]:
%timeit separation3(chrom, jacc, doc)

255 µs ± 3.4 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [50]:
%timeit separation4(chrom, jacc, doc)

253 µs ± 2.09 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [63]:
%timeit separation5(chrom, jacc, doc)

253 µs ± 2.36 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [66]:
k = len(np.unique(chrom))

%timeit separation6(chrom, jacc, doc, k)

12 10
12 5
12 5
12 6
10 5
10 5
10 6
5 5
5 6
5 6
12 10
12 5
12 5
12 6
10 5
10 5
10 6
5 5
5 6
5 6
12 10
12 5
12 5
12 6
10 5
10 5
10 6
5 5
5 6
5 6
12 10
12 5
12 5
12 6
10 5
10 5
10 6
5 5
5 6
5 6
12 10
12 5
12 5
12 6
10 5
10 5
10 6
5 5
5 6
5 6
12 10
12 5
12 5
12 6
10 5
10 5
10 6
5 5
5 6
5 6
12 10
12 5
12 5
12 6
10 5
10 5
10 6
5 5
5 6
5 6
12 10
12 5
12 5
12 6
10 5
10 5
10 6
5 5
5 6
5 6
The slowest run took 16.57 times longer than the fastest. This could mean that an intermediate result is being cached.
1.75 ms ± 2.44 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


# Current Ver

In [15]:
@numba.njit
def jaccard_sim2(a, b):
    #: assume union is non-empty since each sentence >= 1 word
    return np.sum(a & b) / np.sum(a | b)


@numba.njit
def sigmoid2(x):
    return 1 / (1 + np.exp(-x))


@numba.njit
def cohesion2(chromosome, sim, doc):
    total = 0
    for p in np.unique(chromosome):
        sents = doc[np.where(chromosome == p)]
        k = len(sents)
        #: combinations choose 2
        for i in range(k-1):
            for j in range(i+1, k):
                total += sim(sents[i], sents[j]) / len(sents)  
    return total


@numba.njit
def separation2(chromosome, sim, doc):
    total = 0
    k = len(np.unique(chromosome))
    #: combinations choose 2
    for p in range(k-1):
        for q in range(p+1, k):
            sents_p = doc[np.where(chromosome == p)]
            sents_q = doc[np.where(chromosome == q)]
            #: product
            m, n = len(sents_p), len(sents_q)
            for i in range(m):
                for j in range(n):
                    total += sim(sents_p[i], sents_q[j]) / m / n
    return total


@numba.njit
def cohesion_separation2(chromosome, sim, doc):
    coh = cohesion2(chromosome, sim, doc)
    sep = separation2(chromosome, sim, doc)
    return (1 + sigmoid2(coh)) ** sep

In [9]:
assert cohesion2(chrom, jaccard_sim2, doc) == cohesion(chrom, jaccard_sim, doc)
assert separation2(chrom, jaccard_sim2, doc) == separation(chrom, jaccard_sim, doc)
assert cohesion_separation2(chrom, jaccard_sim2, doc) == cohesion_separation(chrom, jaccard_sim, doc)

In [51]:
%timeit np.unique(chrom)

5.15 µs ± 30.3 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
