In [1]:
import numpy as np 
import collections

unigrams = collections.defaultdict(int)

with open('../../NLP_Resources/polish_corpora.txt') as f:
    for line in f:
        line = line.lower()
        words = line.split()
        for word in words:
            unigrams[word] += 1

In [2]:
def load_data(file):
    K = 5
    with open(file) as f:
        temp = 1
        res = [x.split() for x in f if int(x.split()[0]) >= K]
        return res
bigramData = load_data('../../NLP_Resources/poleval_2grams.txt')

In [25]:
unigramSum = 0.0
unigramsPPMI = collections.defaultdict(int)

alfa = 0.75

for key in unigrams.keys():
    unigramSum += unigrams[key]**alfa

for key in unigrams.keys():
    unigramsPPMI[key] = (unigrams[key]**alfa / unigramSum )


In [26]:

bigramSum = 0.0
bigramPPMI = collections.defaultdict(int)

for el in bigramData:
    value, w1, w2 = el
    bigramSum += int(value)**alfa

for el in bigramData:
    value, w1, w2 = el
    bigramPPMI[w1+'#'+w2] = (int(value)**alfa / bigramSum )

## a) PPMI

In [44]:
testSet = ["dziewczynka", "kobieta", "helikopter", "śmigłowiec", "chłopak", "mężczyzna", "herbata", "kawa", "piwo", "wino"]

results = collections.defaultdict(list)
i = 0
for bigram in bigramPPMI:
    w1, w2 = bigram.split('#')
    if w1 in testSet or w2 in testSet:
        if unigramsPPMI.get(w1, 0) == 0 or unigramsPPMI.get(w2, 0) == 0:
            continue
        temp = np.maximum(0.0, np.log(bigramPPMI[bigram] / unigramsPPMI[w1] / unigramsPPMI[w2]))

    if w1 in testSet:
        results[w1].append((temp, w2))
    elif w2 in testSet:
        results[w2].append((temp, w1))
    


In [46]:
import pprint

for word in testSet:
    results[word] = sorted(results[word], reverse=True)
    print(word)
    pprint.pprint(results[word][:10])
    print()

dziewczynka
[(9.731769047508996, '7-letnia'),
 (9.695176424381922, 'dwunastoletnia'),
 (9.620454043670291, '9-letnia'),
 (9.543283226298316, '12-letnia'),
 (9.534995849158378, 'ośmioletnia'),
 (9.479414870043087, '10-letnia'),
 (9.34267370244762, '13-letnia'),
 (9.319234294819541, 'kilkunastoletnia'),
 (9.242525157979228, '14-letnia'),
 (9.198394782962028, 'czteroletnia')]

kobieta
[(9.24640889032487, '65-letnia'),
 (8.99405471285896, '54-letnia'),
 (8.972314060204022, 'sposora'),
 (8.857313545263494, 'ciezarna'),
 (8.797281514508342, '38-letnia'),
 (8.64155199092466, 'zamozna'),
 (8.64155199092466, '50-letnia'),
 (8.621727048123542, 'spodziewająca'),
 (8.505062881809131, 'dojrzala'),
 (8.474194327439001, '22-letnia')]

helikopter
[(9.325936454415205, 'ratowniczy'),
 (9.310259854943007, 'szturmowy'),
 (9.155910276550829, 'ratunkowy'),
 (8.826907255819535, 'lądował'),
 (8.58017299565743, 'policyjny'),
 (8.327818818191519, 'policyjny'),
 (6.801831480583695, 'wojskowy'),
 (6.6650903129882

## b) PSM


In [48]:
bigramCount = collections.defaultdict(int)

for el in bigramData:
    value, w1, w2 = el
    bigramCount[w1+'#'+w2] += int(value)

unigramTrueSum = 0.0
for el in unigrams:
    unigramTrueSum += unigrams[el]

In [49]:
results = collections.defaultdict(list)

for bigram in bigramPPMI:
    w1, w2 = bigram.split('#')
    if w1 in testSet or w2 in testSet:
        if unigrams.get(w1, 0) == 0 or unigrams.get(w2, 0) == 0:
            continue
        temp = bigramCount[bigram] * (np.log(bigramCount[bigram]) - np.log(unigrams[w1] * unigrams[w2] / unigramTrueSum) - 1)

    if w1 in testSet:
        results[w1].append((temp, w2))
    elif w2 in testSet:
        results[w2].append((temp, w1))

In [50]:
for word in testSet:
    results[word] = sorted(results[word], reverse=True)
    print(word)
    pprint.pprint(results[word][:10])
    print()

dziewczynka
[(1433.5082621986912, 'mała'),
 (173.55058477035226, 'letnia'),
 (131.37311926138224, 'kilkuletnia'),
 (101.1708296343594, 'miała'),
 (96.9205436775878, 'czteroletnia'),
 (93.71573937803157, 'została'),
 (89.21380949573266, 'każda'),
 (82.67086769027179, '10-letnia'),
 (77.75784078775601, 'zmarła'),
 (75.7866723043003, 'dwunastoletnia')]

kobieta
[(3502.559794836202, 'każda'),
 (2360.058859048639, 'młoda'),
 (1601.444719586835, 'pierwsza'),
 (978.8323799190453, 'że'),
 (785.4416127612078, 'jedna'),
 (635.8306807833914, 'gdy'),
 (541.2543931989528, 'ma'),
 (540.791019689812, 'starsza'),
 (530.1435296603872, 'powinna'),
 (519.6900251494683, 'piękna')]

helikopter
[(87.5431250759608, 'ratunkowy'),
 (62.86709854657339, 'ratowniczy'),
 (55.90663959816749, 'policyjny'),
 (44.80055972726157, 'szturmowy'),
 (41.68646289137557, 'jeden'),
 (41.57820906643843, 'lądował'),
 (38.250952815584995, 'policyjny'),
 (33.6932446778394, 'wojskowy'),
 (27.166096114229727, 'wojskowy'),
 (26.45902

## c) Kolokacje gramatyczno-słowowe

In [51]:
wordToSupertag = dict()
sufToSupertag = collections.defaultdict(list)

with open('../../NLP_Resources/supertags.txt') as f:
    temp = [tuple(x.split()) for x in f]
    wordToSupertag = {word: tag for (word, tag) in temp}
    for word,tag in temp:
        sufToSupertag[word[-3:]].append(tag)

In [60]:
import random
tagBigrams = collections.defaultdict(int)

for bigram in bigramCount:
    w1, w2 = bigram.split('#')
    tag1 = wordToSupertag.get(w1, '')
    tag2 = wordToSupertag.get(w2, '')
    tagBigrams[tag1+'#'+tag2] += bigramCount[bigram]


In [61]:
results = collections.defaultdict(list)
for bigram in bigramPPMI:
    w1, w2 = bigram.split('#')
    if w1 in testSet or w2 in testSet:
        if unigrams.get(w1, 0) == 0 or unigrams.get(w2, 0) == 0:
            continue
        tag1 = wordToSupertag.get(w1, '')
        if tag1 == '':
            tag1 = random.choice(sufToSupertag.get(w1[-3:], ["empty"]))
        
        tag2 = wordToSupertag.get(w2, '')
        if tag2 == '':
            tag2 = random.choice(sufToSupertag.get(w2[-3:], ["empty"]))

        hash = tag1 + '#' + tag2
        temp = bigramCount[bigram] + tagBigrams[hash]

    if w1 in testSet:
        results[w1].append((temp, w2))
    elif w2 in testSet:
        results[w2].append((temp, w1))

In [62]:
for word in testSet:
    results[word] = sorted(results[word], reverse=True)
    print(word)
    pprint.pprint(results[word][:10])
    print()

dziewczynka
[(408098, 'mała'),
 (407913, 'letnia'),
 (407912, 'każda'),
 (407904, 'kilkuletnia'),
 (407900, 'czteroletnia'),
 (407897, 'trzyletnia'),
 (407897, 'pewna'),
 (407897, 'ośmioletnia'),
 (407897, 'grzeczna'),
 (407897, 'dwunastoletnia')]

kobieta
[(474280, 'ciężarna'),
 (474251, 'wysoka'),
 (474243, 'samotna'),
 (474242, 'cmentarna'),
 (474240, 'podeszła'),
 (474240, 'pełna'),
 (474240, 'chora'),
 (474239, 'idealna'),
 (474238, 'współczesna'),
 (408546, 'każda')]

helikopter
[(963993, 'na'),
 (963991, 'w'),
 (805849, 'wojskowy'),
 (805849, 'policyjny'),
 (794961, 'ratunkowy'),
 (794958, 'policyjny'),
 (794957, 'wojskowy'),
 (794956, 'szturmowy'),
 (611422, 'w'),
 (209110, 'i')]

śmigłowiec
[(2905, 'transportowy'),
 (2900, 'wielozadaniowy'),
 (2900, 'ratunkowy'),
 (2894, 'szturmowy'),
 (2894, 'nowy'),
 (2890, 'przeciwpancerny'),
 (2889, 'wielosilnikowy'),
 (2889, 'pokładowy'),
 (2889, 'bojowy'),
 (2873, 'lekki')]

chłopak
[(97576, 'młody'),
 (97319, 'fajny'),
 (97313, 'dobry')

## d) Własna modyfikacja

In [63]:
#Tagi + PSM

for bigram in bigramPPMI:
    w1, w2 = bigram.split('#')
    if w1 in testSet or w2 in testSet:
        if unigrams.get(w1, 0) == 0 or unigrams.get(w2, 0) == 0:
            continue
        tag1 = wordToSupertag.get(w1, '')
        if tag1 == '':
            tag1 = random.choice(sufToSupertag.get(w1[-3:], ["empty"]))
        
        tag2 = wordToSupertag.get(w2, '')
        if tag2 == '':
            tag2 = random.choice(sufToSupertag.get(w2[-3:], ["empty"]))

        hash = tag1 + '#' + tag2
        temp = tagBigrams[hash] + bigramCount[bigram] * (np.log(bigramCount[bigram]) - np.log(unigrams[w1] * unigrams[w2] / unigramTrueSum) - 1)

    if w1 in testSet:
        results[w1].append((temp, w2))
    elif w2 in testSet:
        results[w2].append((temp, w1))

In [64]:
for word in testSet:
    results[word] = sorted(results[word], reverse=True)
    print(word)
    pprint.pprint(results[word][:10])
    print()

dziewczynka
[(409322.5082621987, 'mała'),
 (408098, 'mała'),
 (408062.55058477033, 'letnia'),
 (408020.3731192614, 'kilkuletnia'),
 (407985.9205436776, 'czteroletnia'),
 (407978.21380949573, 'każda'),
 (407964.7866723043, 'dwunastoletnia'),
 (407963.0780795019, 'ośmioletnia'),
 (407955.68645066494, 'trzyletnia'),
 (407953.05597575393, 'grzeczna')]

kobieta
[(474592.87194314407, 'ciężarna'),
 (474281.8654898933, 'samotna'),
 (474280, 'ciężarna'),
 (474279.3026841755, 'cmentarna'),
 (474265.37538172345, 'podeszła'),
 (474263.6318337567, 'wysoka'),
 (474260.74140603363, 'chora'),
 (474251.6703273105, 'idealna'),
 (474251, 'wysoka'),
 (474248.91431328084, 'współczesna')]

helikopter
[(963993, 'na'),
 (963991, 'w'),
 (963977.5676694525, 'na'),
 (963972.548912268, 'w'),
 (805882.2509528156, 'policyjny'),
 (805871.1660961142, 'wojskowy'),
 (805849, 'wojskowy'),
 (805849, 'policyjny'),
 (795038.543125076, 'ratunkowy'),
 (795006.9066395982, 'policyjny')]

śmigłowiec
[(3049.539032949492, 'transp