1. Download [docker image](https://hub.docker.com/r/djstrong/krnnt2) of KRNNT2. It includes the following tools:
   1. Morfeusz2 - morphological dictionary
   1. Corpus2 - corpus access library
   1. Toki - tokenizer for Polish
   1. Maca - morphosyntactic analyzer
   1. rknnt - Polish tagger
1. Use the tool to tag and lemmatize the corpus with the bills.
1. Using the tagged corpus compute bigram statistic for the tokens containing:
   1. lemmatized, downcased word
   1. morphosyntactic category of the word (noun, verb, etc.)
1. Exclude bigram containing non-words (such as numbers, punctuation, etc.)
1. For example: "Ala ma kota", which is tagged as:
   ```
   Ala	none
           Ala	subst:sg:nom:f	disamb
   ma	space
           mieć	fin:sg:ter:imperf	disamb
   kota	space
           kot	subst:sg:acc:m2	disamb
   .	none
           .	interp	disamb
   ```
   the algorithm should return the following bigrams: `ala:subst mieć:fin` and `mieć:fin kot:subst`.
1. Compute LLR statistic for this dataset.
1. Select top 50 results including noun at the first position and noun or adjective at the second position.

In [46]:
import os
import requests
import regex
from collections import Counter
import math

def filesNames():
    path = '../ustawy'
    absolute_path = os.path.realpath(path) + "\\"
    return [(absolute_path + filename, filename) for filename in os.listdir(path)]

def getFileTextRaw(path):
    with open(path, 'r', encoding="utf8") as content_file:
        return content_file.read()
    
def cleanText(text):
    lower_text = text.replace('-\n', '').replace('\n', ' ').lower()
    unpunct_text = regex.sub(r"\p{P}", ' ' ,lower_text)
    without_empty_text = regex.sub(r"\s+", ' ' ,unpunct_text).strip()
    return without_empty_text

def lematText(text):
    url = 'http://localhost:9200'
    data = text.encode('utf-8')
    return requests.post(url=url, data=data).content.decode('utf-8')

def splitWords(lemat_text):
    words = []
#     assertion that there is always 4
#     prev = 0
    for line in lemat_text.splitlines():
        splitted = line.split('\t')
        splitted_len = len(splitted)
        if splitted_len == 4:
            words.append((splitted[1], splitted[2].split(':')[0]))
#         if prev == splitted_len:
#             print(splitted, splitted_len)
#             break
#         prev = len(splitted)
    return words

def correctWord(word):
    return word[0].isalpha()

def countsBigrams(words, bigram_counter, left_counter, right_counter):
    bigram_size = 0
    for index, right_word in enumerate(words[1:]):
        left_word = words[index-1]
        if correctWord(right_word) and correctWord(left_word):
            bigram_counter[(left_word, right_word)] += 1
            left_counter[left_word] +=1
            right_counter[right_word] +=1
            bigram_size += 1
    return bigram_size

def shannon(values, N):
    return sum([k / N * math.log(k/N + (k == 0)) for k in values])

def llr(bigram, val, left_counter, right_counter, bigram_size):
    k_11 = val
    k_12 = right_counter[bigram[1]] - val
    k_21 = left_counter[bigram[0]] - val
    k_22 = bigram_size - right_counter[bigram[1]] - left_counter[bigram[0]] + val
    return 2 * bigram_size * (shannon([k_11, k_12, k_21, k_22], bigram_size) - shannon([k_11 + k_12, k_21 + k_22], bigram_size) - shannon([k_11 + k_21, k_12 + k_22], bigram_size))


In [47]:
bigram_counter = Counter()
left_counter = Counter()
right_counter = Counter()
bigram_size = 0
for (path, filename) in filesNames():
    content = getFileTextRaw(path)
    clean_text = cleanText(content)
    lemat_text = lematText(clean_text)
    words = splitWords(lemat_text)
    bigram_size += countsBigrams(words, bigram_counter, left_counter, right_counter)

In [48]:
llr_values = []
for bigram, val in bigram_counter.items():
    llr_values.append((bigram, llr(bigram, val, left_counter, right_counter, bigram_size)))
    
llr_values.sort(key=lambda x: -x[1])

In [49]:
filter_llr_values = list(filter(lambda x: x[0][0][1] == 'subst' and x[0][1][1] in ['subst', 'adj'], llr_values))
filter_llr_values[:50]

[((('mowa', 'subst'), ('art', 'subst')), 66073.48468025675),
 ((('ustawa', 'subst'), ('dzień', 'subst')), 55238.90386364769),
 ((('art', 'subst'), ('usta', 'subst')), 50781.10927832162),
 ((('mowa', 'subst'), ('ust', 'subst')), 29221.301698348656),
 ((('mowa', 'subst'), ('usta', 'subst')), 19532.092745820457),
 ((('dzień', 'subst'), ('grudzień', 'subst')), 18010.25392135024),
 ((('dzień', 'subst'), ('styczeń', 'subst')), 16750.926922731625),
 ((('jednostka', 'subst'), ('terytorialny', 'adj')), 13286.426732486218),
 ((('dzień', 'subst'), ('czerwiec', 'subst')), 13242.387222237949),
 ((('kara', 'subst'), ('wolność', 'subst')), 13136.475933608664),
 ((('terytorium', 'subst'), ('polski', 'adj')), 12776.67696679245),
 ((('porozumienie', 'subst'), ('minister', 'subst')), 12214.604681474128),
 ((('dzień', 'subst'), ('lipiec', 'subst')), 12204.146048841663),
 ((('termin', 'subst'), ('dzień', 'subst')), 10651.33034336112),
 ((('dzień', 'subst'), ('sierpień', 'subst')), 10244.026619823593),
 (((