In [2]:
import colibricore
import numpy as np

In [3]:
# Text file
basename = 'wsj_raw_repl_pad'
text_file = basename + '.txt'

# Instantiate class encoder
classencoder = colibricore.ClassEncoder()

# Build classes
classencoder.build(text_file)

# Save class file
class_file = basename + '.colibri.cls'
classencoder.save(class_file)

print("Encoded", len(classencoder), 'classses')

Encoded 13082 classses


In [4]:
corpus_file = basename + '.colibri.dat'
classencoder.encodefile(text_file, corpus_file)

In [5]:
# Check if encoding worked as planned
classdecoder = colibricore.ClassDecoder(class_file)
decoded = classdecoder.decodefile(corpus_file)

# Print
print('##### Decoded #####')
print(decoded[:150])

##### Decoded #####
<s> Pierre Vinken , 61 years old , will join the board as a nonexecutive director Nov. 29 .
<s> Mr. Vinken is chairman of Elsevier N.V. , the Dutch pu


In [6]:
## Pattern model

#Set the options
options = colibricore.PatternModelOptions(mintokens=2,maxlength=3)

#Instantiate an empty unindexed model 
model = colibricore.UnindexedPatternModel()

#Train it on our corpus file (class-encoded data, not plain text)
model.train(corpus_file, options)

In [7]:
# Check model
print("Found " , len(model), " patterns:")

for pattern, count in list(model.items())[:10]:
    print(pattern.tostring(classdecoder), count)

Found  21373  patterns:
President Bush to 2
a line-item veto 4
to exercise a 2
of steel shipped 2
ton of steel 2
per ton of 2
to strike . 2
an ounce to 2
price of the 2
the grain industry's 2


In [8]:
## query specific patterns
# The bigrams starting with <s>
qp = classencoder.buildpattern('<s> I')
print('How many "<s> I"')
print(model[qp])

print('Top 10 patterns')
for pattern, count in sorted(model.top(10), key=lambda x:-x[1]):
    print(pattern.tostring(classdecoder), count)

How many "<s> I"
13
Top 10 patterns
, 4888
the 4047
<s> 3613
. 3397
of 2322
to 2165
a 1875
in 1573
and 1512
" 1407
for 815


In [9]:
# All N-grams
print('Unigram count:')
print(len(list(model.filter(0, colibricore.Category.NGRAM, 1))))
print('Bigram count:')
print(len(list(model.filter(0, colibricore.Category.NGRAM, 2))))
print('Trigram count:')
print(len(list(model.filter(0, colibricore.Category.NGRAM, 3))))

# All bigrams start with '<s>'
start_bigram = []
for pattern, count in model.filter(0, colibricore.Category.NGRAM, 2):
    if pattern[0].tostring(classdecoder) == '<s>':
        start_bigram.append((pattern, count))
print()
print('Bigram starting with <s> count:')
print(len(start_bigram))
print('Top 20 of them')
for pattern, count in sorted(start_bigram, key=lambda x: x[1], reverse=True)[:20]:
    print(pattern.tostring(classdecoder), count)

Unigram count:
5955
Bigram count:
10419
Trigram count:
4999

Bigram starting with <s> count:
313
Top 20 of them
<s> The 636
<s> In 184
<s> But 151
<s> Mr. 117
<s> " 89
<s> A 88
<s> He 60
<s> It 57
<s> They 45
<s> And 37
<s> For 36
<s> That 32
<s> If 28
<s> As 28
<s> This 27
<s> At 25
<s> Some 24
<s> While 21
<s> Sales 19
<s> These 18


In [11]:
def unigram_entropy(word, model, classencoder, classdecoder):
    qp = classencoder.buildpattern(word)
    if qp not in model:
        print('"{}" not in model'.format(word))
        return None
        
    total_occur = np.sum(count for pattern, count in model.filter(0, colibricore.Category.NGRAM, 2) \
                         if pattern[0].tostring(classdecoder) == word)
    entropy = 0.0
    for pattern, count in model.filter(0, colibricore.Category.NGRAM, 2):
        if pattern[0].tostring(classdecoder) == word:
            prob = count / total_occur
            entropy += - prob * np.log2(prob)
    return entropy

In [12]:
print(unigram_entropy('<s>', model, classencoder, classdecoder))
print(unigram_entropy('the', model, classencoder, classdecoder))
print(unigram_entropy('and', model, classencoder, classdecoder))
print(unigram_entropy('I', model, classencoder, classdecoder))

6.21843647371
8.61231756445
6.90829275255
3.57103797056


In [145]:
model.occurrencecount(classencoder.buildpattern('<s> The'))

636

In [116]:
# Train skipgram on indexed model
options = colibricore.PatternModelOptions(mintokens=2,maxlength=3, doskipgrams=True)

corpus_indexed = colibricore.IndexedCorpus(corpus_file)
model_indexed = colibricore.IndexedPatternModel(reverseindex=corpus_indexed)

model_indexed.train(corpus_file, options)

In [118]:
# Skipgram patterns included
print("Found " , len(model_indexed), " patterns:")

Found  21793  patterns:


In [124]:
for pattern, count in sorted( model_indexed.top(10, colibricore.Category.SKIPGRAM), key=lambda x:x[1]*-1 ):
    print(pattern.tostring(classdecoder), " -- ", count)

the {*} of  --  181
, {*} ,  --  171
<s> {*} the  --  142
, {*} the  --  141
<s> {*} ,  --  109
a {*} of  --  91
the {*} .  --  82
, {*} says  --  82
, {*} said  --  80
to {*} the  --  76
the {*} ,  --  71


In [141]:
qp = classencoder.buildpattern('<s>')
if qp in model_indexed:
    print('Occurrence count', model_indexed.occurrencecount(qp))
    print('Frequency', model_indexed.frequency(qp))
else:
    print('qp not in model_indexed')

Occurrence count 3613
Frequency 0.04129992455590865


In [19]:
p0, _ = list(model.filter(0, colibricore.Category.NGRAM, 1))[0]

In [21]:
print(type(p0))
print(p0.tostring(classdecoder))

<class 'colibricore.Pattern'>
Dan


In [23]:
p1 = classencoder.buildpattern('Dan')
print(p0 == p1)

True


In [40]:
def unigram_entropy(word, model, classencoder, classdecoder):
    """
    word: a str or a pattern
    model: an instance of colibricore.UnindexedPatternModel
    """
    if isinstance(word, str):
        word = classencoder.buildpattern(word)
    if word not in model:
        # print('"{}" not in model'.format(word))
        return None

    bigram_counts = np.asarray([count for pattern, count in model.filter(0, colibricore.Category.NGRAM, 2) if pattern[0] == word])
    probs = bigram_counts / np.sum(bigram_counts)
    entropy = - np.sum(probs * np.log2(probs))

    return entropy

In [32]:
qp = classencoder.buildpattern('<s>')
print(unigram_entropy_new('<s>', model, classencoder, classdecoder))
print(unigram_entropy_new(qp, model, classencoder, classdecoder))

6.21843647371
6.21843647371


In [34]:
import os
# debug
base_name = 'wsj_raw_repl_lower'
class_file = base_name + '.colibri.cls'
corpus_file = base_name + '.colibri.dat'
if not os.path.exists(class_file) or not os.path.exists(corpus_file):
    _, _, classencoder, classdecoder = encode_file(base_name)
else:
    classencoder = colibricore.ClassEncoder(class_file)
    classdecoder = colibricore.ClassDecoder(corpus_file)

In [35]:
options = colibricore.PatternModelOptions(mintokens=2,maxlength=3)
patternmodel_file = base_name + '.colibri.patternmodel'
if os.path.exists(patternmodel_file):
    model = colibricore.UnindexedPatternModel(patternmodel_file, options)
else:
    model = colibricore.UnindexedPatternModel()
    model.train(corpus_file, options)
    model.write(patternmodel_file)

In [54]:
unigram_entropy('sir', model, classencoder, classdecoder)

-0.0

In [62]:
word = 'i'
qp = classencoder.buildpattern(word)
print(qp)

<colibricore.Pattern object at 0x105e2c110>


In [63]:
np.asarray([count for pattern, count in model.filter(0, colibricore.Category.NGRAM, 2) if pattern[0] == qp])

array([2, 2, 2, 7, 2, 7, 7, 2, 2, 6, 2, 2, 6, 3])

In [64]:
qp.tostring(classdecoder)

'{?}'

In [73]:
for pattern, _ in list(model.filter(0, colibricore.Category.NGRAM, 1))[:10]:
    entropy = unigram_entropy(pattern, model, classencoder, classdecoder)
    print(model[pattern], entropy)

13 1.0
14 1.53049305676
12 0.970950594455
11 1.57095059445
5 -0.0
34 1.5
12 0.650022421648
2 -0.0
2 -0.0
5 -0.0


In [75]:
pattern.tostring(classencoder)

TypeError: Argument 'decoder' has incorrect type (expected colibricore.ClassDecoder, got colibricore.ClassEncoder)

In [83]:
mylist = [1,2,2,1,1,1,2,1,1,1]
rm_idx = []
for i, n in enumerate(mylist):
    if n == 2:
        rm_idx.append(i)
for idx in sorted(rm_idx, reverse=True): del mylist[idx]

print(rm_idx)
print(mylist)

[1, 2, 6]
[1, 1, 1, 1, 1, 1, 1]
