In [11]:
import math
import os
import requests

from collections import Counter

### Task 1

The tagger is set up locally and is queried via HTTP POST requests.

In [2]:
# Smoke test: send one sentence to the local tagger and show its raw answer.
# The body is encoded as UTF-8 explicitly, so the bytes sent do not depend on
# how the HTTP stack encodes `str` bodies (http.client, for example, uses
# ISO-8859-1, which cannot represent every Polish letter).
print(requests.post('http://localhost:9200', data="Ala ma kota.".encode('utf-8')).text)

Ala	none
	Ala	subst:sg:nom:f	disamb
ma	space
	mieć	fin:sg:ter:imperf	disamb
kota	space
	kot	subst:sg:acc:m2	disamb
.	none
	.	interp	disamb




### Tasks 2 & 3
Only the lines containing an element (a lemma with its tag) are relevant. They are not exactly every second line, because the tagger output also contains some empty lines.

In [3]:
bills_dir = "../bills/"
# Sorted so that files are processed in a fixed order: the Counter's insertion
# order (which breaks ties in most_common and fixes the order of later dict
# iteration) is then reproducible between runs and machines.
bills_files = sorted(os.listdir(bills_dir))

bigram_counts = Counter()

for file_path in (os.path.join(bills_dir, filename) for filename in bills_files):
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read().lower()  # whole text in lowercase

    response = requests.post('http://localhost:9200', data=text.encode('utf-8'))
    # Fail loudly on an HTTP error instead of counting "bigrams" from an error page.
    response.raise_for_status()

    prev_element = (".", ".")  # non-letter sentinel; bigrams containing it are discarded later
    for line in response.text.split('\n'):
        fields = line.split()
        # Only interpretation lines split into exactly three fields
        # (lemma, tag, "disamb"); token lines and empty lines are skipped.
        if len(fields) == 3:
            word, tag, _ = fields
            element = (word, tag.split(":")[0])  # keep only the first part of the tag
            bigram_counts[(prev_element, element)] += 1
            prev_element = element

print(bigram_counts.most_common(5))

[((('artykuł', 'brev'), ('.', 'interp')), 84197), ((('ustęp', 'brev'), ('.', 'interp')), 53367), ((('pozycja', 'brev'), ('.', 'interp')), 45473), (((',', 'interp'), ('pozycja', 'brev')), 43373), ((('.', 'interp'), ('1', 'adj')), 40014)]


### Task 4

Bigrams containing a token with non-letter characters are dropped only AFTER all bigram counts have been collected.

In [5]:
# Gather the keys to remove first, so the Counter is never resized while it is
# being iterated. A bigram is kept only when both of its words are purely
# alphabetic.
non_letter_bigrams = [
    bigram
    for bigram in bigram_counts
    if not (bigram[0][0].isalpha() and bigram[1][0].isalpha())
]
for bigram in non_letter_bigrams:
    del bigram_counts[bigram]

print(bigram_counts.most_common(5))

[((('w', 'prep'), ('artykuł', 'brev')), 32184), ((('o', 'prep'), ('który', 'adj')), 28762), ((('który', 'adj'), ('mowa', 'subst')), 28644), ((('mowa', 'subst'), ('w', 'prep')), 28579), ((('w', 'prep'), ('ustęp', 'brev')), 23564)]


### Task 6
The same implementation of LLR (log-likelihood ratio) as in the previous exercise.

In [9]:
first_tokens = Counter()
second_tokens = Counter()

# Marginal counts: how many bigram occurrences have a given element in the
# first position, and how many have it in the second position.
for (first, second), count in bigram_counts.items():
    first_tokens[first] += count
    second_tokens[second] += count

# Total number of bigram occurrences that survived the filtering.
all_bigrams = sum(bigram_counts.values())

In [12]:
def H(k, N=None):
    """Return the sum of p * log(p) over the counts in `k`, where p = k_i / N.

    When the counts sum to `N` this is the negative of the Shannon entropy
    (natural logarithm). A zero count contributes 0: `(k_i == 0)` adds 1
    inside the logarithm, so log(0) is never evaluated.

    `N` defaults to the current value of `all_bigrams`. It is looked up at
    call time, so recomputing `all_bigrams` in another cell cannot leave a
    stale total frozen into this function.
    """
    if N is None:
        N = all_bigrams
    return sum(k_i / N * math.log(k_i / N + (k_i == 0)) for k_i in k)


bigrams_LLR = {}

for bigram, value in bigram_counts.items():
    first, second = bigram

    # 2x2 contingency table for the bigram (first, second):
    k11 = value                             # `first` followed by `second`
    k12 = second_tokens[second] - value     # `second` preceded by something else
    k21 = first_tokens[first] - value       # `first` followed by something else
    k22 = all_bigrams - (k11 + k12 + k21)   # all remaining bigram occurrences

    # 2 * N * ( H(k_default) - H(k_rows_summed) - H(k_cols_summed) )
    bigrams_LLR[bigram] = 2 * all_bigrams * (
        H([k11, k12, k21, k22])
        - H([k11 + k12, k21 + k22])
        - H([k11 + k21, k12 + k22])
    )

In [15]:
# Peek at the first five (bigram, score) entries in the dict's current order.
print([entry for entry, _ in zip(bigrams_LLR.items(), range(5))])

[((('ustawa', 'subst'), ('z', 'prep')), 42103.3587276636), ((('z', 'prep'), ('dzień', 'subst')), 53719.01511066644), ((('o', 'prep'), ('zmiana', 'subst')), 4524.516023343441), ((('zmiana', 'subst'), ('ustawa', 'subst')), 4469.344251147419), ((('ustawa', 'subst'), ('o', 'prep')), 4768.322820801322)]


### Task 7
For partitioning, a dict of dicts (one `Counter` per category pair) is created with all the needed data. The old dict could be removed now.

In [16]:
partitions = {}              # (category1, category2) -> Counter of the bigrams in that pair
partitions_sums = Counter()  # total count per category pair, for a quick top-10 lookup

for bigram, count in bigram_counts.items():
    category_pair = (bigram[0][1], bigram[1][1])
    partitions.setdefault(category_pair, Counter())[bigram] = count
    partitions_sums[category_pair] += count

# Note: bigram_counts is intentionally left intact here (it is not cleared).

### Task 8

In [20]:
# The ten category pairs with the highest total bigram count.
top_category_pairs = partitions_sums.most_common(10)
top_category_pairs

[(('prep', 'subst'), 327378),
 (('subst', 'subst'), 290104),
 (('subst', 'adj'), 274761),
 (('adj', 'subst'), 188238),
 (('subst', 'prep'), 173254),
 (('subst', 'conj'), 85145),
 (('conj', 'subst'), 84393),
 (('prep', 'adj'), 79459),
 (('ger', 'subst'), 77510),
 (('prep', 'brev'), 67230)]

### Task 9

Sorting bigrams by LLR score in descending order and printing the top 5 pairs for each of the category pairs above.

In [28]:
# Re-order the dict by LLR score, highest first (dicts keep insertion order).
bigrams_LLR = dict(sorted(bigrams_LLR.items(), key=lambda entry: entry[1], reverse=True))

for (cat1, cat2), _ in partitions_sums.most_common(10):
    print("{} - {}:".format(cat1, cat2))
    # Lazily walk the score-ordered bigrams that belong to this category pair.
    matching = (
        (word1, word2, score)
        for ((word1, c1), (word2, c2)), score in bigrams_LLR.items()
        if c1 == cat1 and c2 == cat2
    )
    # zip with range(5) stops after five matches without scanning any further.
    for _, (w1, w2, value) in zip(range(5), matching):
        print("\t{} {} - score {}".format(w1, w2, value))

prep - subst:
	z dzień - score 53719.01511066644
	na podstawa - score 47390.74094334523
	do sprawa - score 46330.491514938025
	w droga - score 32061.652843175118
	od dzień - score 31769.65610444706
subst - subst:
	droga rozporządzenie - score 54074.098685296
	skarb państwo - score 21933.42719414035
	rada minister - score 18307.859865915794
	terytorium rzeczpospolita - score 14282.498596292238
	ochrona środowisko - score 14029.89436117486
subst - adj:
	minister właściwy - score 71011.96509014728
	rzeczpospolita polski - score 43291.456752802434
	jednostka organizacyjny - score 24612.579437237084
	samorząd terytorialny - score 23394.073680596783
	produkt leczniczy - score 21913.30289879499
adj - subst:
	który mowa - score 249004.7889867908
	niniejszy ustawa - score 21508.981199226586
	następujący zmiana - score 18162.46590885003
	odrębny przepis - score 13058.75727457624
	walny zgromadzenie - score 9655.361888330419
subst - prep:
	mowa w - score 177749.45525470216
	ustawa z - score 42103

### Task 10

#### What types of bigrams have been found?
In eight out of ten cases one of the parts is a substantive, paired with a preposition, adjective, conjunction, gerund or another noun. The remaining two are prepositions followed by adjectives or by abbreviations (which are just shortened versions of nouns). So substantives are the most common among these types.

#### Which of the category-pairs indicate valuable multiword expressions? Do they have anything in common?
Substantive-substantive, substantive-adjective and adjective-substantive. Expressions like "Skarb Państwa", "Rzeczpospolita Polska" or "walne zgromadzenie" give much information about the character and topic of the texts.  
All of them contain at least one noun.

#### Which signal: LLR score or syntactic category is more useful for determining genuine multiword expressions?
I think both were useful here. Syntactic categories provided variety in the "types" of expressions; depending on the text, some pairs of categories might be more valuable than others. The LLR score gave information about how strongly words are associated with each other (rather than just how often they co-occur). Without it we could only operate on the number of occurrences, and we might miss many distinctive expressions specific to the processed text, because they would be buried under more general and popular expressions.

#### Can you describe a different use-case where the morphosyntactic category is useful for resolving a real-world problem?
Suggesting the next word in a phone keyboard's dictionary. Autocompleting a whole sentence this way already returns reasonable sentences.