In [1]:
from scipy.sparse import load_npz
import pandas as pd
import numpy as np
import json 
from countmatrix_editor import filter_countmatrix, make_dicts, quotient_countmatrix, w2c_totable
import time

In [2]:
# Load the target table (not used by the tests in this notebook).
targets = pd.read_csv('sparse data/count_target.csv')

# Load the column -> word mapping. JSON object keys are always strings, so
# turn them back into integer column indices, then invert the mapping to get
# the word -> column dictionary.
with open('sparse data/vocab_json_file.json', 'r') as vocab_file:
    c2w = {int(col): word for col, word in json.load(vocab_file).items()}
w2c = {word: col for col, word in c2w.items()}

# Load the sparse matrix of word counts.
textcv = load_npz('sparse data/count_matrix.npz')

# Build the vocab DataFrame from the word -> column dictionary.
vocab = w2c_totable(w2c)

# Preview ten randomly chosen rows (positions are drawn with replacement).
vocab.iloc[np.random.randint(0, len(vocab), size=10), :]

Unnamed: 0,word,col
412154,necroid,412154
675282,vacationshouldremainthesame,675282
700134,whites,700134
541450,ronnit,541450
459385,parodyexempt,459385
364252,majedie,364252
575601,shrinkability,575601
65687,beingimporte,65687
41368,artypoker,41368
594274,spirometric,594274


In [3]:
def test_subvocab(subvocab, origmat, subdict, origdict, k = 50):
    words = subvocab['word'].values
    for k in range(k):
        word = np.random.choice(words)
        print(f'Occurences of "{word}" match in the two matrices: ', (origmat.getcol(origdict[word]) != subset.getcol(subdict[word])).sum()==0)

def test_merged(rel, mergedvocab, mergedmat, origmat, mergedict, origdict, k = 50):
    words = mergedvocab['word'].values
    keys =  rel.keys()
    for k in range(k):
        word = np.random.choice(words)
        while word in keys:
            word = np.random.choice(words)
        print(f'Occurences of "{word}" match in the two matrices: ', (origmat.getcol(origdict[word]) != mergedmat.getcol(mergedict[word])).sum()==0)

def test_eqrel(rel, originalmat, mergedmat, originaldict, mergeddict):
    for key, values in rel.items():
        print(f'Occurences of "{key}" in merged matrix match total occurrences of "{key}"',
              *[f', "{value}"' for value in values[:-1]],
              *[',' for k in set('*') if len(values)>1],
              f'and "{values[-1]}" in orginal: ',
              (mergedmat.getcol(mergeddict[key]) != (sum([originalmat.getcol(originaldict[value]) for value in values]) + originalmat.getcol(originaldict[key]))).sum() == 0)

# Test dropping words

In [4]:
# Choose 100 distinct entries of the 'col' column at random; these are the
# vocab items to drop.
random_drops = np.random.choice(vocab['col'].values, size = 100, replace = False)
# perf_counter() is the clock meant for measuring short intervals: it is
# monotonic and high-resolution, whereas time.time() follows the wall clock
# and can jump if the system time is adjusted.
start = time.perf_counter()
smaller_vocab = vocab[~vocab['col'].isin(random_drops)].copy() #create smaller vocab with words dropped (columns from original)
end = time.perf_counter()
print(f'Time to subset vocab table: {end-start}')
print(f'New vocab is smaller by {vocab.shape[0] - smaller_vocab.shape[0]} lexical items')

Time to subset vocab table: 0.03452587127685547
New vocab is smaller by 100 lexical items


In [5]:
# Timed with perf_counter(): monotonic and high-resolution, unlike the
# wall-clock time.time(), which can jump if the system time is adjusted.
start = time.perf_counter()
subvocab, subset = filter_countmatrix(smaller_vocab, textcv) #use smaller vocab to subset the matrix (remove columns), and renumber vocab table
end = time.perf_counter()
print(f'Time to subset matrix and vocab table: {end-start}')

Time to subset matrix and vocab table: 1.310939073562622


In [6]:
# Timed with perf_counter(): monotonic and high-resolution, unlike the
# wall-clock time.time(), which can jump if the system time is adjusted.
start = time.perf_counter()
sub_w2c, sub_c2w = make_dicts(subvocab) #convert new vocab table to dictionaries
end = time.perf_counter()
print(f'Time to create vocab dictionaries from table: {end-start}')

Time to create vocab dictionaries from table: 5.505706071853638


In [7]:
# Spot-check random words: each one's column in the filtered matrix (looked up
# through the new dictionary) must equal its column in the original matrix.
test_subvocab(subvocab=subvocab, origmat=textcv, subdict=sub_w2c, origdict=w2c)

Occurences of "satnam" match in the two matrices:  True
Occurences of "systemized" match in the two matrices:  True
Occurences of "hsdailyfeature" match in the two matrices:  True
Occurences of "vogle" match in the two matrices:  True
Occurences of "rustup" match in the two matrices:  True
Occurences of "falltuesdayekttitab" match in the two matrices:  True
Occurences of "gpcflatlng" match in the two matrices:  True
Occurences of "oonly" match in the two matrices:  True
Occurences of "ovma" match in the two matrices:  True
Occurences of "broadcastersinclude" match in the two matrices:  True
Occurences of "veneral" match in the two matrices:  True
Occurences of "nnotice" match in the two matrices:  True
Occurences of "designrequirement" match in the two matrices:  True
Occurences of "casiegraphic" match in the two matrices:  True
Occurences of "inportant" match in the two matrices:  True
Occurences of "incurred" match in the two matrices:  True
Occurences of "worksugarsugarboard" match 

# Test merging words

For the sake of example, we merge some words that we might otherwise want to keep separate (e.g. "american" into "america").

In [8]:
# Equivalence relation used for the merge test: each key is the representative
# word, and its list holds the words that are merged into it.
eqrel = {
    'america': ['american', 'americanize', 'americanisation', 'americanization'],
    'color': ['colour'],
}

In [9]:
# Timed with perf_counter(): monotonic and high-resolution, unlike the
# wall-clock time.time(), which can jump if the system time is adjusted.
start = time.perf_counter()
mergedvocab, mergedmatrix = quotient_countmatrix(eqrel,vocab, textcv)
end = time.perf_counter()
print(f'Time to create merged matrix and vocab table: {end-start}')

Time to create merged matrix and vocab table: 15.622627973556519


In [10]:
# Timed with perf_counter(): monotonic and high-resolution, unlike the
# wall-clock time.time(), which can jump if the system time is adjusted.
start = time.perf_counter()
w2c_merge, c2w_merge = make_dicts(mergedvocab)
end = time.perf_counter()
print(f'Time to create vocab dictionaries from table: {end-start}')

Time to create vocab dictionaries from table: 5.371376037597656


In [11]:
# Spot-check random words that are not keys of the equivalence relation: their
# columns in the merged matrix must equal their columns in the original matrix.
test_merged(
    rel=eqrel,
    mergedvocab=mergedvocab,
    mergedmat=mergedmatrix,
    origmat=textcv,
    mergedict=w2c_merge,
    origdict=w2c,
)

Occurences of "parvaaz" match in the two matrices:  True
Occurences of "designrequirement" match in the two matrices:  True
Occurences of "learnednot" match in the two matrices:  True
Occurences of "cumberlidge" match in the two matrices:  True
Occurences of "simonsberg" match in the two matrices:  True
Occurences of "ptc" match in the two matrices:  True
Occurences of "hoffmann" match in the two matrices:  True
Occurences of "ndeam" match in the two matrices:  True
Occurences of "toneed" match in the two matrices:  True
Occurences of "pointspartner" match in the two matrices:  True
Occurences of "dobyli" match in the two matrices:  True
Occurences of "onrushing" match in the two matrices:  True
Occurences of "inseam" match in the two matrices:  True
Occurences of "sinyoungcho" match in the two matrices:  True
Occurences of "yuntie" match in the two matrices:  True
Occurences of "oequieten" match in the two matrices:  True
Occurences of "elemec" match in the two matrices:  True
Occuren

In [12]:
# Check that each merged column is the sum of the original columns of every
# word in its equivalence class.
test_eqrel(
    rel=eqrel,
    originalmat=textcv,
    mergedmat=mergedmatrix,
    originaldict=w2c,
    mergeddict=w2c_merge,
)

Occurences of "america" in merged matrix match total occurrences of "america" , "american" , "americanize" , "americanisation" , and "americanization" in orginal:  True
Occurences of "color" in merged matrix match total occurrences of "color" and "colour" in orginal:  True


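It is worth building the dictionaries: in the timings below, a dictionary lookup takes tens of nanoseconds, whereas filtering the vocab DataFrame for the same word takes tens of milliseconds, a difference of several orders of magnitude.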

In [13]:
# Pick one random vocabulary word to use in the lookup timings that follow.
word = np.random.choice(vocab.word.values)
word

'nufem'

In [14]:
%%timeit
# Baseline: find the word's column by boolean-filtering the vocab DataFrame.
vocab[vocab.word == word].loc[:,'col'].values[0]

39.2 ms ± 2.12 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [15]:
%%timeit -n 10 -r 7
# Same lookup through the word -> column dict, fixed at 10 loops per run over 7 runs.
w2c[word]

The slowest run took 57.27 times longer than the fastest. This could mean that an intermediate result is being cached.
264 ns ± 573 ns per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [16]:
%%timeit 
# Dict lookup again, this time letting timeit choose the number of loops itself.
w2c[word]

26 ns ± 0.0324 ns per loop (mean ± std. dev. of 7 runs, 10,000,000 loops each)
