In [1]:
import os
# Move the working directory up one level before the remaining imports run.
# NOTE(review): presumably required so `semiolog` resolves from the parent directory — confirm.
os.chdir("../")
from joblib import Parallel, delayed

import semiolog as slg

# Build the semiolog object used by the cells below; they read its `corpus.train`
# and `config.system.cpu_count` attributes.
# NOTE(review): "fr_wiki" looks like a project/corpus name and `requested_cpu=4` a
# worker count — confirm against the semiolog API.
semiotic = slg.Cenematic("fr_wiki",requested_cpu=4)

Vocabulary will not be loaded from file.



In [2]:
from collections import Counter
import operator
from functools import reduce
import regex as re
import time

In [3]:
def count(chain):
    """Tally every adjacent pair of items in ``chain``.

    Args:
        chain: A sliceable sequence (list, string, ...) of hashable items.

    Returns:
        A ``Counter`` keyed by ``(left, right)`` tuples of consecutive items.
        A chain with fewer than two items yields an empty ``Counter``.
    """
    # Zipping the chain with itself shifted by one yields each item with its successor.
    successors = chain[1:]
    return Counter(zip(chain, successors))

def normalize(chain):
    """Flatten ``chain`` into a list of its characters, with spaces removed.

    Args:
        chain: A string, or an iterable of strings, to flatten.

    Returns:
        A list with one entry per character of the concatenated input, skipping
        every ``" "`` character. Other whitespace (tabs, newlines, ...) is kept.
    """
    flat_text = "".join(chain)
    return [char for char in flat_text if char != " "]

def agglutinate_chain(pair, chain_list):
    """Merge every occurrence of ``pair`` in ``chain_list`` into a single token.

    The chain is scanned left to right and matches never overlap, so merging
    ``("a", "a")`` in ``["a", "a", "a"]`` gives ``["aa", "a"]``.

    The merge works directly on the token list instead of round-tripping through
    a space-joined string and ``str.split()``. That round trip silently dropped
    tokens that are themselves whitespace (tab, newline, non-breaking space, ...)
    as well as empty tokens; they are now preserved.

    Args:
        pair: The tokens to merge, as a tuple/list of strings, e.g. ``("es", "t")``.
            A plain string is also accepted and is read one character per token
            (``"es"`` means ``("e", "s")``), as before.
        chain_list: Iterable of string tokens.

    Returns:
        A new list of tokens in which each run matching ``pair`` is replaced by
        the concatenation of its parts.
    """
    parts = list(pair)
    tokens = list(chain_list)
    width = len(parts)
    if width == 0:
        # Nothing to merge; hand back a copy of the chain.
        return tokens

    first = parts[0]
    merged_token = "".join(parts)
    merged = []
    position = 0
    total = len(tokens)
    while position < total:
        # Cheap first-token test before comparing the whole window.
        if tokens[position] == first and tokens[position:position + width] == parts:
            merged.append(merged_token)
            position += width
        else:
            merged.append(tokens[position])
            position += 1
    return merged

In [8]:
# Number of corpus items handed to each worker. An earlier sizing derived it from a
# target vocabulary instead: int(vocab_size / cpu_count) + 1 with vocab_size = 10000.
chunksize = 250000
# One consecutive, non-overlapping slice of the training corpus per CPU.
chains = [
    semiotic.corpus.train[offset:offset + chunksize]
    for offset in range(0, semiotic.config.system.cpu_count * chunksize, chunksize)
]

In [9]:
# Chunk-parallel merge loop: repeatedly merge the globally most frequent adjacent
# pair of tokens, then tally the resulting token frequencies into `freq`.
start = time.perf_counter()

with Parallel(n_jobs=semiotic.config.system.cpu_count) as parallel:
    # Flatten every chunk into single-character tokens (spaces removed).
    chains = [normalize(chain) for chain in chains]
    n_iter = 0
    while n_iter<10:
        # Count adjacent pairs per chunk in the workers, then sum the partial counts.
        # Each chunk is counted on its own, so pairs straddling two chunks are not seen.
        result = parallel(delayed(count)(chain) for chain in chains)
        pairs = reduce(operator.add, result)
        if not pairs:
            # No adjacent pairs left (every chunk has fewer than two tokens): stop merging.
            break
        # Keep the winner as a (left, right) tuple. Collapsing it into one string made
        # agglutinate_chain re-split it per character, so a pair such as ("es", "t")
        # was searched as "e s t" and the wrong tokens were merged.
        best_pair = pairs.most_common(1)[0][0]

        chains = parallel(delayed(agglutinate_chain)(best_pair,chain) for chain in chains)
        n_iter += 1
    # Token frequencies per chunk, summed into one global Counter.
    freqs = parallel(delayed(Counter)(chain) for chain in chains)
    freq = reduce(operator.add, freqs)

print(time.perf_counter()-start)

90.00467734


In [6]:
# Total number of corpus items covered by the per-CPU chunks (chunk size times CPU count).
vocab_size = semiotic.config.system.cpu_count * chunksize
# The same leading span of the training corpus, taken as one unsplit chain.
chain = semiotic.corpus.train[:vocab_size]

In [7]:
# Sequential reference run of the merge loop over the single unsplit chain;
# the resulting token frequencies go into `freq_seq`.
start = time.perf_counter()
# Flatten the chain into single-character tokens (spaces removed).
chain = normalize(chain)
n_iter = 0
while n_iter<10:
    pairs = count(chain)
    if not pairs:
        # Fewer than two tokens left: nothing to merge.
        break
    # Keep the winner as a (left, right) tuple. Collapsing it into one string made
    # agglutinate_chain re-split it per character, so a pair such as ("es", "t")
    # was searched as "e s t" and the wrong tokens were merged.
    best_pair = pairs.most_common(1)[0][0]
    chain = agglutinate_chain(best_pair,chain)
    n_iter += 1
freq_seq = Counter(chain)

print(time.perf_counter()-start)

59.490051267


In [9]:
# Report every token whose count differs between the parallel (`freq`) and the
# sequential (`freq_seq`) run; no output means the two runs agree.
# The comparison is per token over the union of both vocabularies. Zipping the two
# most_common() lists stops at the shorter one, hiding tokens present in only one
# run, and depends on how ties happen to be ordered.
for token in sorted(freq.keys() | freq_seq.keys()):
    # A Counter returns 0 for a missing token, so one-sided tokens are reported too.
    if freq[token] != freq_seq[token]:
        print((token, freq[token]), (token, freq_seq[token]))

In [10]:
# Show the n most frequent tokens of both runs side by side:
# parallel result first, sequential result second.
n = 50
[
    (parallel_entry, sequential_entry)
    for parallel_entry, sequential_entry in zip(freq.most_common(n), freq_seq.most_common(n))
]

[(('a', 441290), ('a', 441290)),
 (('t', 393100), ('t', 393100)),
 (('i', 368352), ('i', 368352)),
 (('e', 363301), ('e', 363301)),
 (('u', 360618), ('u', 360618)),
 (('r', 335156), ('r', 335156)),
 (('l', 300385), ('l', 300385)),
 (('o', 267331), ('o', 267331)),
 (('s', 263724), ('s', 263724)),
 (('c', 211612), ('c', 211612)),
 (('d', 181825), ('d', 181825)),
 (('n', 180368), ('n', 180368)),
 (('es', 176590), ('es', 176590)),
 (('p', 176275), ('p', 176275)),
 (('m', 173766), ('m', 173766)),
 (('é', 162941), ('é', 162941)),
 (('en', 122018), ('en', 122018)),
 (('on', 112878), ('on', 112878)),
 (('de', 103861), ('de', 103861)),
 ((',', 93686), (',', 93686)),
 (('an', 91953), ('an', 91953)),
 (('g', 80184), ('g', 80184)),
 (('v', 79655), ('v', 79655)),
 (('le', 76879), ('le', 76879)),
 (('re', 72665), ('re', 72665)),
 (('ti', 70646), ('ti', 70646)),
 (('h', 69910), ('h', 69910)),
 (('er', 65824), ('er', 65824)),
 (('f', 65681), ('f', 65681)),
 (('.', 64470), ('.', 64470)),
 (('is', 63108