In [1]:
import os
os.chdir("../")

from tqdm.notebook import tqdm, trange
from pyinstrument import Profiler
from joblib import Parallel, delayed, parallel_backend

import semiolog as slg

semiotic = slg.Cenematic("en_bnc",requested_cpu=4)

from collections import Counter, defaultdict
from functools import reduce
import operator
import time

Vocabulary will not be loaded from file.



In [2]:
def pre_process(corpus_chunk, normalizer):
    # Normalize
    chain_zip = normalizer(corpus_chunk)
    # Build list of pairs
    chain_zip = list(zip(chain_zip,chain_zip[1:]))
    # Create a lookup table of all the positions where a pair appears in a corpus
    pair_pos = defaultdict(set)
    for i,k in list(enumerate(chain_zip)):
        pair_pos[k].add(i)
    # From the previous lookup table, create another lookup table of the frequency of each pair (given by the size of the set of its positions)
    pair_len = Counter()
    for k,pos in pair_pos.items():
        pair_len[k] = len(pos)
    
    return (chain_zip, pair_pos, pair_len)


def process_best_pair(job_data, best_pair):
    chain_zip, pair_pos, pair_len = job_data
    chain_zip_len = len(chain_zip)

    for i in pair_pos[best_pair]:
        ## merge best pair with left unit
        left_pair_i = i-1
        while left_pair_i>=0 and chain_zip[left_pair_i] == None: # if left pair is within chain limits but empty (= None) because already merged previously, shift to the left
            left_pair_i -= 1
        if left_pair_i>-1: # proceed only if a left pair was found on the left
            # Remove from left pair positions, the current position (of the pair to be merged)
            left_pair = chain_zip[left_pair_i]
            left_pair_pos = pair_pos[left_pair]
            left_pair_pos.discard(left_pair_i)
            new_pair = (left_pair[0],"".join(best_pair)) # construct new left pair
            pair_pos[new_pair].add(left_pair_i) # add new pair (if non existing) and its position to the pair_pos lookup table
            # update the counts in the pair_len lookuptable
            pair_len[left_pair] -= 1
            pair_len[new_pair] += 1
            # update the list of pairs
            chain_zip[left_pair_i] = new_pair

        ## merge best pair with right unit
        right_pair_i = i+1
        while right_pair_i<chain_zip_len and chain_zip[right_pair_i] == None: # if right pair is within chain limits but empty (= None) because already merged previously, shift to the right
            right_pair_i += 1
        if right_pair_i<chain_zip_len: # proceed only if a left pair was found on the right
            # Remove from right pair positions, the current position (of the pair to be merged)
            right_pair = chain_zip[right_pair_i]
            right_pair_pos = pair_pos[right_pair]
            right_pair_pos.discard(right_pair_i)
            new_pair = ("".join(best_pair), right_pair[1]) # construct new right pair
            pair_pos[new_pair].add(right_pair_i) # add new pair (if non existing) and its position to the pair_pos lookup table
            # update the counts in the pair_len lookuptable
            pair_len[right_pair] -= 1
            pair_len[new_pair] += 1
            # update the list of pairs
            chain_zip[right_pair_i] = new_pair

        # Empty best pair position in list of pairs
        chain_zip[i] = None

    # Remove best pair from lookuptables
    del pair_pos[best_pair]
    del pair_len[best_pair]

    return (chain_zip, pair_pos, pair_len)

def compute_freq(chain_zip):
    # TODO: add the last unit to the decoupling
    freq = [pair[0] for pair in chain_zip if pair != None]
    if chain_zip[-1]!=None: 
        freq.append(chain_zip[-1][-1])
    freq = Counter(freq)
    return freq

In [3]:
delta_voc = 3

In [4]:
requested_cpu = 4 # self.cpu_count
chunksize = int(semiotic.corpus.train_len/40)

corpus_chunks = ["".join(semiotic.corpus.train[i*chunksize:i*chunksize+chunksize]) for i in range(0,requested_cpu)]

normalizer = semiotic.vocab.normalizer.normalize

In [5]:
with Parallel(n_jobs=requested_cpu) as parallel_pool:
    print("Normalize and jobs data...")
    start = time.time()
    jobs_data = parallel_pool(delayed(pre_process)(chunk,normalizer) for chunk in corpus_chunks)

    pair_len_global = reduce(operator.add,[i[-1] for i in jobs_data])
    best_pair = pair_len_global.most_common(1)[0][0]
    
    merges = [best_pair]
    print(f"... computed in {time.time()-start} secs.\n")

    print("Enter loop")
    for _ in range(delta_voc):

        print(f"{_+1}/{delta_voc}: {best_pair}...")
        start = time.time()
        jobs_data = parallel_pool(delayed(process_best_pair)(job_data, best_pair) for job_data in jobs_data)

        pair_len_global = reduce(operator.add,[i[-1] for i in jobs_data])
        best_pair = pair_len_global.most_common(1)[0][0]

        merges.append(best_pair)
        print(f"... computed in {time.time()-start} secs.\n")
    
    print("Compute freq...")
    start = time.time()
    freqs = parallel_pool(delayed(compute_freq)(job_data[0]) for job_data in jobs_data)
    freq = reduce(operator.add, freqs)
    print(f"... computed in {time.time()-start} secs.\n")

Normalize and jobs data
... computed in 36.27325797080994 secs.

Enter loop
0/3: ('t', 'h')...
0 reduce
... computed in 29.919811248779297 secs.

1/3: ('i', 'n')...
1 reduce
... computed in 37.25240898132324 secs.

2/3: ('th', 'e')...
2 reduce
... computed in 38.88198518753052 secs.

Compute freq...
... computed in 16.694128274917603 secs.



In [6]:
freq.most_common(20)

[('e', 4248845),
 ('a', 3255073),
 ('o', 3085943),
 ('s', 2625840),
 ('t', 2575704),
 ('r', 2500714),
 ('i', 2169901),
 ('n', 2074927),
 ('l', 1684089),
 ('d', 1561717),
 ('c', 1266467),
 ('u', 1140697),
 ('m', 1006417),
 ('h', 961428),
 ('f', 903832),
 ('p', 835466),
 ('g', 810828),
 ('w', 780760),
 ('in', 778858),
 ('the', 759571)]

In [8]:
merges

[('t', 'h'), ('i', 'n'), ('th', 'e'), ('a', 'n')]