In [1]:
from logger import FileProcessLogger, file_line_count

In [6]:
bigrams = {}
filename = './dane/poleval_2grams.txt'
with open(filename, 'r', encoding='utf-8') as file:
    logger = FileProcessLogger(filename)
    for line in file:
        logger.update()
        n, word1, word2 = line.lower().split()
        n = int(n)
        if n < 10: 
            continue
        if word1 in bigrams:
            bigrams[word1][word2] = n
        else:
            bigrams[word1] = { word2: n }

Processed 10% of the file contents...
Processed 20% of the file contents...
Processed 30% of the file contents...
Processed 40% of the file contents...
Processed 50% of the file contents...
Processed 60% of the file contents...
Processed 70% of the file contents...
Processed 80% of the file contents...
Processed 90% of the file contents...
Processed 100% of the file contents...


In [10]:
tags = {}
filename = './dane/supertags.txt'
with open(filename, 'r', encoding='utf-8') as file:
    for line in file:
        word, tag = line.rstrip('\n').split(' ')
        tags[word] = tag

In [11]:
bigram_tags = {}
filename = './dane/poleval_2grams.txt'
with open(filename, 'r', encoding='UTF-8') as file:
    logger = FileProcessLogger(filename)
    for line in file:
        logger.update()
        n, word1, word2 = line.lower()[:-1].split(' ')
        if word1 in tags and word2 in tags:
            n, tag1, tag2 = int(n), tags[word1], tags[word2]
            if tag1 not in bigram_tags:
                bigram_tags[tag1] = { tag2: n }
            else:
                if tag2 not in bigram_tags[tag1]:
                    bigram_tags[tag1][tag2] = 0
                bigram_tags[tag1][tag2] += n

Processed 10% of the file contents...
Processed 20% of the file contents...
Processed 30% of the file contents...
Processed 40% of the file contents...
Processed 50% of the file contents...
Processed 60% of the file contents...
Processed 70% of the file contents...
Processed 80% of the file contents...
Processed 90% of the file contents...
Processed 100% of the file contents...


In [12]:
from math import log as ln

In [13]:
def popular_words(bigrams, skip, take):
    words = [(word, sum(bigrams[word].values())) for word in bigrams]
    return sorted(words, key=lambda x: x[1], reverse=True)[skip:skip+take]

In [8]:
print(popular_words(bigrams, 1000, 10))

[('uzyskał', 35775), ('posiedzeniu', 35771), ('każdej', 35706), ('zmarł', 35687), ('stało', 35666), ('francji', 35602), ('procesu', 35554), ('oni', 35518), ('poziomu', 35510), ('zjednoczonych', 35396)]


In [15]:
def PPMI(x, y, bigrams, N):
    Px = 0
    Py = 0
    Pxy = 0.001

    if x in bigrams:
        Px = sum(bigrams[x].values())
        if y in bigrams[x]:
            Pxy = bigrams[x][y]
    for word in bigrams:
        if x in bigrams[word]:
            Px += bigrams[word][x]
     
    if y in bigrams:
        Py = sum(bigrams[y].values())
    for word in bigrams:
        if y in bigrams[word]:
            Py += bigrams[word][y]

    PMI = ln(N * Pxy / Px / Py)
    return max(0, PMI) 

In [19]:
def PSM(x, y, bigrams, N):
    Px = 0
    Py = 0
    Pxy = 0.001

    if x in bigrams:
        Px = sum(bigrams[x].values())
        if y in bigrams[x]:
            Pxy = bigrams[x][y]
    for word in bigrams:
        if x in bigrams[word]:
            Px += bigrams[word][x]
     
    if y in bigrams:
        Py = sum(bigrams[y].values())
    for word in bigrams:
        if y in bigrams[word]:
            Py += bigrams[word][y]
    
    estimatedP = Px * Py / N

    return Pxy * (ln(Pxy) - ln(estimatedP) - 1)

In [22]:
from math import inf


def get_tag(x):
    if x in tags:
        return tags[x]
    suffix = x[-3:]
    if suffix in tags:
        return tags[suffix]
    return None


def tagged_PPMI(x, y, bigrams, bN, tag_bigrams, tbN):
    tag_x = get_tag(x)
    if tag_x is None: return -inf
    tag_y = get_tag(y)
    if tag_y is None: return -inf

    return PPMI(tag_x, tag_y, tag_bigrams, tbN) * PPMI(x, y, bigrams, bN)

In [27]:
bN = 0
for w in bigrams:
    bN += sum(bigrams[w].values())

In [28]:
tN = 0
for t in bigram_tags:
    tN += sum(bigram_tags[t].values())

In [29]:
methods = [
    ('PPMI', lambda x,y: PPMI(x,y, bigrams, bN)), 
    ('PSM', lambda x,y: PSM(x, y, bigrams, bN)), 
    ('tagged_PPMI', lambda x,y: tagged_PPMI(x, y, bigrams, bN, bigram_tags, tbN))]
for word, _ in popular_words(bigrams, 1000, 10):
    print(f'testing for word: "{word}"')
    for name, method in methods:
        print()
        print('Method:', name)
        scores = [(method(word, successor), successor) for successor in bigrams[word].keys()]
        scores.sort(key=lambda score: score[0], reverse=True)
        for (score, successor) in scores[:10]:
            print(word , successor, score)
    print(20 * '-')

testing for word: "uzyskał"

Method: PPMI
uzyskał reelekcji 7.210127276806494
uzyskał bakalaureat 7.0995949727462735
uzyskał licencjat 7.016188217012809
uzyskał habilitację 6.815102527492896
uzyskał magisterium 6.7634162710211285
uzyskał mandat 6.653081042942986
uzyskał doktorat 6.552324052563292
uzyskał doktoraty 6.529281499549021
uzyskał dyplom 6.348639483936616
uzyskał profesurę 6.273917371176575

Method: PSM
uzyskał mandat 20695.929698214273
uzyskał stopień 17850.419369789994
uzyskał tytuł 14695.222506511493
uzyskał doktorat 6124.213429977311
uzyskał dyplom 5969.081664073263
uzyskał status 3276.1616672223213
uzyskał reelekcję 2872.489998525407
uzyskał licencjat 2027.4554291333168
uzyskał habilitację 1872.4630138527125
uzyskał magisterium 1711.7346324932755

Method: tagged_PPMI
uzyskał habilitację 7.213870368757442
uzyskał profesurę 6.641019183113157
uzyskał reelekcję 6.238352602953533
uzyskał miano 5.954402955347609
uzyskał akceptację 5.922735162584043
uzyskał nominację 5.812641116