# Morphosyntactic tagging


## Solution to https://github.com/apohllo/nlp/blob/master/5-tagging.md

In [12]:
import requests
from os import listdir
from os.path import isfile, join

In [13]:
url = "http://localhost:9200"

In [32]:
# Raw tagger responses, one string per processed file (filled by load_bills).
all_bills = []
def load_bills(directory_path: str) -> None:
    """Send every regular file in ``directory_path`` to the tagging service.

    Each file is read as UTF-8 text and POSTed to the module-level ``url``;
    the response body is appended to the module-level ``all_bills`` list.
    Entries that are not regular files (e.g. sub-directories) are skipped.

    Args:
        directory_path: Directory holding the plain-text documents to tag.
    """
    for file in listdir(directory_path):
        path = join(directory_path, file)
        if not isfile(path):
            continue
        # Decode explicitly: the payload is re-encoded as UTF-8 below, so
        # relying on the platform default encoding could garble non-ASCII text.
        with open(path, encoding="utf-8") as input_file:
            corpus = input_file.read()
        r = requests.post(url, corpus.encode("utf-8"))
        all_bills.append(r.text)

In [33]:
# Tag every file found in ./data/; the responses accumulate in all_bills.
load_bills('./data/')

In [27]:
import pickle
def serialize(name, dictionary):
    """Pickle ``dictionary`` into the file at path ``name``.

    The file is created or overwritten and written in binary mode using the
    highest pickle protocol available to the running interpreter.

    Args:
        name: Path of the output file.
        dictionary: Object to store (any picklable object is accepted).
    """
    with open(name, mode='wb') as sink:
        pickler = pickle.Pickler(sink, protocol=pickle.HIGHEST_PROTOCOL)
        pickler.dump(dictionary)
def deserialize(name):
    """Load and return the pickled object stored in the file at ``name``.

    Args:
        name: Path of a file previously written with ``pickle``.

    Returns:
        The unpickled object.
    """
    # NOTE: unpickling can execute arbitrary code, so only load trusted files.
    with open(name, mode='rb') as source:
        restored = pickle.Unpickler(source).load()
    return restored

In [34]:
# Store the collected tagger responses in all_bills_tag.pickle.
serialize('all_bills_tag.pickle', all_bills)

In [38]:
# Show the raw tagger response for the first document.
print(all_bills[0])

Dz	none
	dziennik	brev:pun	disamb
.	none
	.	interp	disamb
U	none
	ustawa	brev:pun	disamb
.	none
	.	interp	disamb
z	space
	z	prep:gen:nwok	disamb
2003	space
	2003	adj:sg:gen:m3:pos	disamb
r	space
	rok	brev:pun	disamb
.	none
	.	interp	disamb

Nr	newline
	numer	brev:npun	disamb
228	space
	228	num:pl:nom:m3:rec	disamb
,	none
	,	interp	disamb
poz	space
	pozycja	brev:pun	disamb
.	none
	.	interp	disamb
2259	space
	2259	num:pl:nom:m3:rec	disamb

U	none
	u	prep:gen	disamb
S	space
	Solidarność	brev:npun	disamb
T	space
	tesla	brev:npun	disamb
A	space
	a	conj	disamb
W	space
	w	prep:loc:nwok	disamb
A	space
	A	subst:sg:loc:n	disamb

z	none
	z	prep:gen:nwok	disamb
dnia	space
	dzień	subst:sg:gen:m3	disamb
12	space
	12	adj:sg:gen:m3:pos	disamb
grudnia	space
	grudzień	subst:sg:gen:m3	disamb
2003	space
	2003	adj:sg:gen:m3:pos	disamb
r	space
	rok	brev:pun	disamb
.	none
	.	interp	disamb

o	none
	o	prep:loc	disamb
zmianie	space
	zmiana	subst:sg:loc:f	disamb
ustawy	space
	ustawa	subst:sg:gen:f	disamb
-	space

In [81]:
# One list of "lemma:tag" strings per tagger response, in all_bills order.
big_list = [parse_bils(raw_response) for raw_response in all_bills]

In [76]:
def parse_bils(corpus):
    """Extract the analysed tokens from one raw tagger response.

    In the tagger output a token line has exactly two tab-separated fields
    (surface form and preceding-whitespace marker) and is followed by a line
    carrying its analysis. Every such following line is passed to
    ``parse_next_line`` and the results are collected in document order.

    Args:
        corpus: Full text of one tagger response.

    Returns:
        List of the strings produced by ``parse_next_line``. Following lines
        it cannot parse (it returns ``None`` for them) are left out.
    """
    lines = corpus.splitlines()
    pairs_list = []
    # Pairing each line with its successor means a token line that happens to
    # be the very last line is skipped instead of raising IndexError.
    for token_line, analysis_line in zip(lines, lines[1:]):
        if len(token_line.split('\t')) != 2:
            continue
        pairs = parse_next_line(analysis_line)
        # Drop unparsable lines: a None entry would break the later
        # string operations performed on every element of the result.
        if pairs is not None:
            pairs_list.append(pairs)
    return pairs_list

In [77]:
def parse_next_line(line):
    """Reduce one tab-separated analysis line to a ``"<base>:<category>"`` string.

    The second field is taken as the base form and the third as the full tag;
    only the part of the tag before its first ``:`` is kept.

    Args:
        line: One line of tagger output.

    Returns:
        ``"<second field>:<first tag component>"``, or ``None`` when the line
        has fewer than three tab-separated fields.
    """
    fields = line.split("\t")
    if len(fields) <= 2:
        return None
    base_form, full_tag = fields[1], fields[2]
    # A tag without any ':' is its own first component, so one path suffices.
    category, _, _ = full_tag.partition(':')
    return f'{base_form}:{category}'
In [110]:
# Bigram frequency table: "<first> <second>" (lower-cased) -> count; filled by bigram_count().
bigrams = {}

In [111]:
def bigram_count():
    """Count adjacent entry pairs across all parsed documents.

    Walks every document in the module-level ``big_list`` and, for each pair
    of neighbouring entries that both pass ``is_alpha``, increments the count
    stored in the module-level ``bigrams`` dict under the lower-cased key
    ``"<first> <second>"``. Pairs with a non-alphabetic member are ignored.
    """
    for document in big_list:
        # zip with the one-shifted list yields exactly the neighbouring pairs.
        for left, right in zip(document, document[1:]):
            if not (is_alpha(left) and is_alpha(right)):
                continue
            key = f'{left.lower()} {right.lower()}'
            bigrams[key] = bigrams.get(key, 0) + 1
                

In [112]:
def is_alpha(word):
    """Tell whether the part of ``word`` before its first ``:`` is alphabetic.

    Args:
        word: String such as ``"dziennik:brev"``; a string without ``:`` is
            checked as a whole.

    Returns:
        ``True`` if that leading part is non-empty and consists only of
        alphabetic characters, ``False`` otherwise.
    """
    head, _, _ = word.partition(":")
    return head.isalpha()

In [113]:
# Populate the bigrams table from big_list.
bigram_count()

In [122]:
# Unigram frequency table: lower-cased entry -> count.
unigrams = {}

In [123]:
# Count every alphabetic entry of every parsed document, case-insensitively.
for bill in big_list:
    for word in bill:
        if not is_alpha(word):
            continue
        # Look up and store under the same lower-cased key. Testing the raw
        # ``word`` for membership while storing ``word.lower()`` would reset
        # the count to 1 for every entry containing an upper-case letter.
        key = word.lower()
        unigrams[key] = unigrams.get(key, 0) + 1

In [127]:
from llr import llr_2x2
# Log-likelihood ratio score for every counted bigram.
llr_dict = {}
# The grand total does not change while scoring, so compute it once instead
# of re-summing the whole table for every bigram.
total_bigrams = sum(bigrams.values())
for bigram in bigrams:
    try:
        word1, word2 = bigram.split(' ')
        # Cells of the 2x2 contingency table for (word1, word2).
        not_word1_word2 = unigrams[word2] - bigrams[bigram]
        word1_not_word2 = unigrams[word1] - bigrams[bigram]
        not_word1_not_word2 = total_bigrams - word1_not_word2 - not_word1_word2 - bigrams[bigram]
        llr_dict[bigram] = llr_2x2(bigrams[bigram], not_word1_word2, word1_not_word2, not_word1_not_word2)
    except Exception:
        # Best effort: skip bigrams that cannot be scored (e.g. a missing
        # unigram count or a key that does not split into two words).
        # ``Exception`` rather than a bare ``except`` so that Ctrl-C and
        # interpreter exit still propagate.
        continue

In [129]:
# (bigram, score) pairs ordered from the highest LLR score to the lowest.
sorted_llr = sorted(llr_dict.items(), key=lambda entry: entry[1] , reverse=True)


In [130]:
# Display the 50 highest-scoring bigrams.
sorted_llr[:50]

[('który:adj mowa:subst', 248324.4349527203),
 ('o:prep który:adj', 163664.93922446412),
 ('mowa:subst w:prep', 149503.9146878908),
 ('otrzymywać:fin brzmienie:subst', 111106.1820813932),
 ('w:prep artykuł:brev', 67791.56558237551),
 ('dodawać:fin się:qub', 66927.52216122783),
 ('w:prep ustęp:brev', 56433.33445513528),
 ('stosować:fin się:qub', 52971.23413636157),
 ('droga:subst rozporządzenie:subst', 51583.88981686512),
 ('właściwy:adj do:prep', 47904.17674144334),
 ('na:prep podstawa:subst', 45476.403285408625),
 ('do:prep sprawa:subst', 45253.133482930134),
 ('z:prep dzień:subst', 43200.15806660266),
 ('określić:ppas w:prep', 43090.76382959541),
 ('móc:fin być:inf', 41374.2576458441),
 ('a:conj także:conj', 40332.86446757338),
 ('zastępować:fin się:qub', 39196.44044670282),
 ('się:qub wyraz:subst', 33870.96200476226),
 ('w:prep w:prep', 31862.451481773518),
 ('w:prep droga:subst', 31360.723432420986),
 ('od:prep dzień:subst', 29491.490230257157),
 ('ustawa:subst z:prep', 29220.10635