<h3>Morphosyntactic tagging</h3>
<p>Konrad Przewłoka</p>

<h4>Necessary imports</h4>

In [86]:
import os
import collections
import requests
import math
from prettytable import PrettyTable

<h4>Load data</h4>

In [6]:
data=[]
files = os.listdir("../ustawy")
for file in files:
    with open("../ustawy" + '/' + file, 'r', encoding='utf8') as f:
        tmp = f.read().lower()
        data.append(tmp)

<h4>Tagowanie i lematyzacja</h4>

In [7]:
taged_data = [(lambda text: requests.post("http://localhost:9200", data=text.encode("utf-8")).content.decode("utf-8"))(text) for text in data]

<h4>Bigram computation</h4>

In [62]:
tokenized_data=[]
for file in taged_data:
    lines = file.split("\n")
    splits = [line.strip().split('\t') for line in lines if line.startswith("\t")]
    tokens = [(split[0].lower(),split[1].split(":")[0]) for split in splits if len(split)>=2]
    tokenized_data.append(tokens)

bigrams = [] 
for file in tokenized_data:
    prev_token = None
    file_bigrams=[]
    for token in file:
        if prev_token!=None:
            file_bigrams.append((prev_token,token))
        prev_token=token
    bigrams.append(file_bigrams)
tokens_total=[]    
for file in tokenized_data:
    for token in file:
        if token[0].isalpha():
            tokens_total.append(token)

tokens_counter= collections.Counter(tokens_total)
bigram_counter= collections.Counter([element for sublist in bigrams for element in sublist])

tmp = []
for key in bigram_counter.keys():
    if not key[0][0].isalpha() or not key[1][0].isalpha():
        tmp.append(key)
        
for key in tmp:
    del bigram_counter[key]

tokens_total = sum(tokens_counter.values())

<h4>LLR for bigrams</h4>

In [65]:
def h(ks):
    total = float(sum(ks))
    return sum([k/total * math.log(k / total + (k==0)) for k in ks])

def llr(bigram):
    k11= bigram_counter[bigram]
    k12= tokens_counter[bigram[1]]-bigram_counter[bigram]
    k21= tokens_counter[bigram[0]]-bigram_counter[bigram]
    k22= tokens_total-k11-k12-k21
    return 2*sum([k11,k12,k21,k22])*(h([k11+k12,k21+k22])-h([k11+k21,k12+k22]))

bigrams_llr={bigram: llr(bigram) for bigram  in bigram_counter.keys()}
collections.Counter(bigrams_llr).most_common()[:10] 

[((('w', 'prep'), ('oczyszczal', 'subst')), 1557988.463414508),
 ((('w', 'prep'), ('nieszczelny', 'adj')), 1557988.463414508),
 ((('w', 'prep'), ('jedność', 'subst')), 1557988.463414508),
 ((('w', 'prep'), ('spisywać', 'ppas')), 1557988.463414508),
 ((('w', 'prep'), ('rio', 'subst')), 1557988.463414508),
 ((('w', 'prep'), ('dwuosobowy', 'adj')), 1557988.463414508),
 ((('w', 'prep'), ('dyskusja', 'subst')), 1557988.463414508),
 ((('w', 'prep'), ('czterobrygadowej', 'adj')), 1557988.463414508),
 ((('w', 'prep'), ('wyścigi', 'subst')), 1557988.463414508),
 ((('w', 'prep'), ('kontrakty', 'subst')), 1557988.463414508)]

<h4>Syntactic categories split of bigrams</h4>

In [81]:
splits={}
tmp_count=[]
for bigram, count in bigram_counter.items():
    tmp_count.append((bigram[0][1],bigram[1][1]))
    if not (bigram[0][1],bigram[1][1]) in splits:
        splits[(bigram[0][1],bigram[1][1])]=[(bigram,count)]
    else:
        splits[(bigram[0][1],bigram[1][1])].append((bigram,count))
    

    
top_splits = collections.Counter(tmp_count).most_common()[:10]
top_splits

[(('subst', 'subst'), 47863),
 (('subst', 'adj'), 27133),
 (('adj', 'subst'), 26138),
 (('subst', 'fin'), 16138),
 (('ger', 'subst'), 15579),
 (('prep', 'subst'), 12282),
 (('subst', 'prep'), 11349),
 (('subst', 'ppas'), 10699),
 (('fin', 'subst'), 8806),
 (('adj', 'fin'), 8695)]

<h4>Top 5 LLR bigrams for 10 largests splits</h4>

In [91]:
top_five={}
random_five={}
for split in top_splits:
    top_five_llr[split[0]]=collections.Counter({bigram:bigrams_llr[bigram] for bigram,count in splits[split[0]]}).most_common()[:5]
for split in top_splits:
    print("Split: "+str(split[0]))
    for bigram,llr in top_five_llr[split[0]]:
        print(str(bigram)+" LLR:"+str(llr))

Split: ('subst', 'subst')
(('dzień', 'subst'), ('immatrykulacja', 'subst')) LLR:326336.9651891766
(('dzień', 'subst'), ('pięćdziesiątnica', 'subst')) LLR:326336.9651891766
(('dzień', 'subst'), ('upra', 'subst')) LLR:326336.9651891766
(('dzień', 'subst'), ('dora', 'subst')) LLR:326336.9651891766
(('dzień', 'subst'), ('opróż', 'subst')) LLR:326336.9651891766
Split: ('subst', 'adj')
(('dzień', 'subst'), ('powszedni', 'adj')) LLR:326336.9651891766
(('dzień', 'subst'), ('zdjęciowy', 'adj')) LLR:326307.5545230917
(('dzień', 'subst'), ('samowolny', 'adj')) LLR:326171.1084107933
(('dzień', 'subst'), ('świąteczny', 'adj')) LLR:326144.9534819157
(('dzień', 'subst'), ('siódmy', 'adj')) LLR:325725.5110191203
Split: ('adj', 'subst')
(('który', 'adj'), ('rych', 'subst')) LLR:521164.20924725314
(('który', 'adj'), ('ustalania', 'subst')) LLR:521164.20924725314
(('który', 'adj'), ('wygrany', 'subst')) LLR:521164.20924725314
(('który', 'adj'), ('ustąpienie', 'subst')) LLR:521164.20924725314
(('który', '

<h4>Answers</h4>
<h5>I</h5>
<p>Most common found pairs of words are of type noun and noun, noun and ajective, djective and noun. Almost all of the top 10 splits contain one noun per pair.</p>
<h5>II</h5>
<p>In my opininon the most useful bigrams seem to be parts of the ('subst', 'adj') and the ('subst', 'subst') partitions as the seem to descibe entities such as certain specific dates (ex. "dzień pięćdziesiątnicy")</p>
<h5>III</h5>
<p>I think that only a combination of both LLR and syntactic category can provide useful and genuine multiword expressions.</p>
<h5>IV</h5>
<p>Morphosyntactic categorization can be usefule when trying to recognize names of entities in texts and in Co-reference Resolution</p>