In [21]:
import ast
import math
import os

import conllu
import pandas as pd

<h4>Preprocess data and save to TSV</h4>

In [2]:
# Dataset directory (also used later as the output directory for the TSV export)
# and the name of the .cupt file to parse.
dir_path = 'PARSEME_1.2_Polish_Dataset/PL'
filename = 'train.cupt'

# Full path of the input file opened by the parsing cell.
filepath = os.path.join(dir_path, filename)

In [21]:
# Peek at the first token of the first sentence to learn the column layout.
with open(filepath, "r", encoding="utf-8") as f:
    sentence = next(conllu.parse_incr(f))
    tok = sentence[0]
    column_names = ['sent_id'] + list(tok.keys())

# Token fields copied into every row (everything except the synthetic sent_id).
token_fields = column_names[1:]

# Collect plain dicts and build the DataFrame once at the end. Growing a
# DataFrame with `DataFrame.append` inside the loop copies the whole frame for
# every sentence (quadratic time), and `append` was removed in pandas 2.0.
token_rows = []

with open(filepath, "r", encoding="utf-8") as f:
    # sent_id is the 0-based position of the sentence in the file
    for sent_id, sentence in enumerate(conllu.parse_incr(f)):
        token_rows.extend(
            {'sent_id': sent_id, **{field: tok[field] for field in token_fields}}
            for tok in sentence
        )

# One row per token; column dtypes are inferred by pandas from the collected values.
df = pd.DataFrame(token_rows, columns=column_names)

df

Unnamed: 0,sent_id,id,form,lemma,upos,xpos,feats,head,deprel,deps,misc,parseme:mwe
0,0,1,Większym,duży,ADJ,adj:sg:inst:m3:com,"{'Animacy': 'Inan', 'Case': 'Ins', 'Degree': '...",2,amod,,,*
1,0,2,problemem,problem,NOUN,subst:sg:inst:m3,"{'Animacy': 'Inan', 'Case': 'Ins', 'Gender': '...",0,root,,,*
2,0,3,będzie,być,AUX,bedzie:sg:ter:imperf,"{'Aspect': 'Imp', 'Mood': 'Ind', 'Number': 'Si...",2,cop,,,*
3,0,4,znalezienie,znaleźć,NOUN,ger:sg:nom:n:perf:aff,"{'Aspect': 'Perf', 'Case': 'Nom', 'Gender': 'N...",2,nsubj,,,*
4,0,5,zawodnika,zawodnik,PROPN,subst:sg:gen:m1,"{'Animacy': 'Hum', 'Case': 'Gen', 'Gender': 'M...",4,obj,,{'SpaceAfter': 'No'},*
...,...,...,...,...,...,...,...,...,...,...,...,...
298432,17730,1,Gorąco,gorąco,ADV,adv:pos,{'Degree': 'Pos'},3,advmod,,,*
298433,17730,2,więc,więc,CCONJ,conj,,1,fixed,,,*
298434,17730,3,polecam,polecać,VERB,fin:sg:pri:imperf,"{'Aspect': 'Imp', 'Mood': 'Ind', 'Number': 'Si...",0,root,,,*
298435,17730,4,kartkę,kartka,NOUN,subst:sg:acc:f,"{'Case': 'Acc', 'Gender': 'Fem', 'Number': 'Si...",3,obj,,{'SpaceAfter': 'No'},*


In [22]:
# save dataframe to a tab-separated (TSV) file next to the source data, without the index column
df.to_csv(os.path.join(dir_path, 'train_df.tsv'), sep='\t', index=False)

In [12]:
# select every token whose PARSEME tag differs from '*', i.e. tokens related to an MWE
df.loc[df['parseme:mwe'].ne('*')]

Unnamed: 0,sent_id,id,form,lemma,upos,xpos,feats,head,deprel,deps,misc,parseme:mwe
8,0,9,doprowadzenie,doprowadzić,NOUN,ger:sg:nom:n:perf:aff,"{'Aspect': 'Perf', 'Case': 'Nom', 'Gender': 'N...",2,nsubj,"[(nsubj, 2)]",,1:LVC.cause
9,0,10,do,do,ADP,prep:gen,{'AdpType': 'Prep'},12,case,"[(case, 12)]",{'Case': 'Gen'},1
12,0,13,chrztów,chrzest,NOUN,subst:pl:gen:m3,"{'Animacy': 'Inan', 'Case': 'Gen', 'Gender': '...",12,nmod:arg,"[(nmod:arg, 12)]",,1
43,1,14,cieszą,cieszyć,VERB,fin:pl:ter:imperf,"{'Aspect': 'Imp', 'Mood': 'Ind', 'Number': 'Pl...",8,acl:relcl,,,1:IRV
44,1,15,się,się,PRON,qub,"{'PronType': 'Prs', 'Reflex': 'Yes'}",14,expl:pv,,,1
...,...,...,...,...,...,...,...,...,...,...,...,...
73727,4388,4,relację,relacja,NOUN,subst:sg:acc:f,"{'Case': 'Acc', 'Gender': 'Fem', 'Number': 'Si...",2,obj,"[(obj, 2)]",,1
73735,4389,5,udzielić,udzielić,VERB,inf:perf,"{'Aspect': 'Perf', 'VerbForm': 'Inf', 'Voice':...",2.0,xcomp,"[(xcomp, 2)]",,1:LVC.full
73738,4389,8,rad,rada,NOUN,subst:pl:gen:f,"{'Case': 'Gen', 'Gender': 'Fem', 'Number': 'Pl...",5.0,obj,"[(obj, 5)]",{'SpaceAfter': 'No'},1
73744,4389,13,śmiał,śmiać,VERB,praet:sg:m1:imperf,"{'Animacy': 'Hum', 'Aspect': 'Imp', 'Gender': ...",10.0,ccomp,"[(ccomp, 10)]",,2:IRV


In [16]:
# get MWE instances: one row per token that opens an MWE
# The opening token of an MWE is tagged '<id>:<category>' (e.g. '1:LVC.full'),
# while the remaining tokens carry only the id. Testing for ':' instead of
# `len > 1` avoids also matching continuation tokens whose tag happens to be
# longer than one character (ids >= 10, or several ids joined with ';').
# NOTE(review): a token that opens several MWEs at once still yields a single row.
df[df['parseme:mwe'].str.contains(':', regex=False, na=False)]

Unnamed: 0,sent_id,id,form,lemma,upos,xpos,feats,head,deprel,deps,misc,parseme:mwe
8,0,9,doprowadzenie,doprowadzić,NOUN,ger:sg:nom:n:perf:aff,"{'Aspect': 'Perf', 'Case': 'Nom', 'Gender': 'N...",2,nsubj,"[(nsubj, 2)]",,1:LVC.cause
43,1,14,cieszą,cieszyć,VERB,fin:pl:ter:imperf,"{'Aspect': 'Imp', 'Mood': 'Ind', 'Number': 'Pl...",8,acl:relcl,,,1:IRV
144,6,27,uśmiechał,uśmiechać,VERB,praet:sg:m1:imperf,"{'Animacy': 'Hum', 'Aspect': 'Imp', 'Gender': ...",2,conj,"[(root, 0), (conj, 2)]",,1:IRV
149,7,1,Przychodzą,przychodzić,VERB,fin:pl:ter:imperf,"{'Aspect': 'Imp', 'Mood': 'Ind', 'Number': 'Pl...",0,root,"[(root, 0)]",,1:VID
304,15,10,brać,brać,VERB,inf:imperf,"{'Aspect': 'Imp', 'VerbForm': 'Inf', 'Voice': ...",9,xcomp,"[(xcomp, 9)]",,1:LVC.full
...,...,...,...,...,...,...,...,...,...,...,...,...
73645,4384,28,chlubi,chlubić,VERB,fin:sg:ter:imperf,"{'Aspect': 'Imp', 'Mood': 'Ind', 'Number': 'Si...",25,parataxis:insert,"[(parataxis:insert, 25)]",,2:IRV
73689,4386,17,był,być,VERB,praet:sg:m1:imperf,"{'Animacy': 'Hum', 'Aspect': 'Imp', 'Gender': ...",0.0,root,"[(root, 0)]",,1:VID
73725,4388,2,zdał,zdać,VERB,praet:sg:m1:perf,"{'Animacy': 'Hum', 'Aspect': 'Perf', 'Gender':...",0,root,"[(root, 0)]",,1:LVC.full
73735,4389,5,udzielić,udzielić,VERB,inf:perf,"{'Aspect': 'Perf', 'VerbForm': 'Inf', 'Voice':...",2.0,xcomp,"[(xcomp, 2)]",,1:LVC.full


In [18]:
# get MWE instances which occur with other MWEs in the same sentence
# Opening tokens are recognised by the ':' in '<id>:<category>'; the previous
# `len > 1` test also matched continuation tokens with longer tags
# (ids >= 10, or several ids joined with ';').
temp_df = df[df['parseme:mwe'].str.contains(':', regex=False, na=False)]

# duplicated() leaves out the first MWE-opening token of each sentence, so the
# result holds only the second and later MWE instances of a sentence
temp_df[temp_df.duplicated(subset='sent_id')]

Unnamed: 0,sent_id,id,form,lemma,upos,xpos,feats,head,deprel,deps,misc,parseme:mwe
1291,75,13,wprowadzono,wprowadzić,VERB,imps:perf,"{'Aspect': 'Perf', 'Mood': 'Ind', 'Person': '0...",7,acl:relcl,"[(acl:relcl, 7)]",,2:LVC.cause
4315,268,24,dokonanie,dokonanie,NOUN,ger:sg:acc:n:perf:aff,"{'Aspect': 'Perf', 'Case': 'Acc', 'Gender': 'N...",21,conj,,,2:LVC.full
4705,286,41,umowy,umowa,NOUN,subst:sg:gen:f,"{'Case': 'Gen', 'Gender': 'Fem', 'Number': 'Si...",40,nmod:arg,"[(nmod:arg, 40)]",,2:LVC.full
4805,294,5,samookreśleniem,samookreślenie,NOUN,subst:sg:inst:n,"{'Case': 'Ins', 'Gender': 'Neut', 'Number': 'S...",3,nmod,,,2:IRV
4847,297,22,pochłonął,pochłonąć,VERB,praet:sg:m3:perf,"{'Animacy': 'Inan', 'Aspect': 'Perf', 'Gender'...",16,acl:relcl,"[(acl:relcl, 16)]",,2:LVC.full
...,...,...,...,...,...,...,...,...,...,...,...,...
71629,4263,24,mamy,mieć,VERB,fin:pl:pri:imperf,"{'Aspect': 'Imp', 'Mood': 'Ind', 'Number': 'Pl...",12,ccomp,"[(ccomp, 12)]",,2:VID
72645,4323,17,wchodząca,wchodzić,ADJ,pact:sg:nom:f:imperf:aff,"{'Aspect': 'Imp', 'Case': 'Nom', 'Gender': 'Fe...",21,acl,,,2:VID
73288,4366,19,mają,mieć,VERB,fin:pl:ter:imperf,"{'Aspect': 'Imp', 'Mood': 'Ind', 'Number': 'Pl...",3,conj,,,2:LVC.full
73645,4384,28,chlubi,chlubić,VERB,fin:sg:ter:imperf,"{'Aspect': 'Imp', 'Mood': 'Ind', 'Number': 'Si...",25,parataxis:insert,"[(parataxis:insert, 25)]",,2:IRV


<h4>Read train data TSV</h4>

In [3]:
# read the train dataframe back from the TSV file written by the preprocessing step
df_train = pd.read_csv(os.path.join(dir_path, 'train_df.tsv'), sep='\t')

df_train

Unnamed: 0,sent_id,id,form,lemma,upos,xpos,feats,head,deprel,deps,misc,parseme:mwe
0,0,1,Większym,duży,ADJ,adj:sg:inst:m3:com,"{'Animacy': 'Inan', 'Case': 'Ins', 'Degree': '...",2.0,amod,,,*
1,0,2,problemem,problem,NOUN,subst:sg:inst:m3,"{'Animacy': 'Inan', 'Case': 'Ins', 'Gender': '...",0.0,root,,,*
2,0,3,będzie,być,AUX,bedzie:sg:ter:imperf,"{'Aspect': 'Imp', 'Mood': 'Ind', 'Number': 'Si...",2.0,cop,,,*
3,0,4,znalezienie,znaleźć,NOUN,ger:sg:nom:n:perf:aff,"{'Aspect': 'Perf', 'Case': 'Nom', 'Gender': 'N...",2.0,nsubj,,,*
4,0,5,zawodnika,zawodnik,PROPN,subst:sg:gen:m1,"{'Animacy': 'Hum', 'Case': 'Gen', 'Gender': 'M...",4.0,obj,,{'SpaceAfter': 'No'},*
...,...,...,...,...,...,...,...,...,...,...,...,...
298432,17730,1,Gorąco,gorąco,ADV,adv:pos,{'Degree': 'Pos'},3.0,advmod,,,*
298433,17730,2,więc,więc,CCONJ,conj,,1.0,fixed,,,*
298434,17730,3,polecam,polecać,VERB,fin:sg:pri:imperf,"{'Aspect': 'Imp', 'Mood': 'Ind', 'Number': 'Si...",0.0,root,,,*
298435,17730,4,kartkę,kartka,NOUN,subst:sg:acc:f,"{'Case': 'Acc', 'Gender': 'Fem', 'Number': 'Si...",3.0,obj,,{'SpaceAfter': 'No'},*


In [47]:
# get number of potential incorrect MWEs: pairs of adjacent tokens that
#   * form one of the PoS patterns in `correct_pos`,
#   * are both tagged '*' (not annotated as part of an MWE), and
#   * are linked through `deps` (one token's id appears among the heads of the other).

# PoS types, which appear at least 50 times in train set
# verb + pron
# verb + noun
# adj + noun
# pron + verb
# verb + adp (przyimek = preposition)
# adj + pron
# noun + verb
# noun + noun
# noun + adj
# adp + noun
# noun + pron
# NOTE(review): 'PRON+VERB' is listed above but is not in `correct_pos` below —
# confirm whether the omission is intentional.

correct_pos = ['VERB+PRON', 'VERB+NOUN', 'ADJ+NOUN', 'VERB+ADP', 'ADJ+PRON', 
               'NOUN+VERB', 'NOUN+NOUN', 'NOUN+ADJ', 'ADP+NOUN', 'NOUN+PRON']

mwe_count = 0
mwe_count_dict = {}

# Rows are collected in a list and turned into a DataFrame once at the end:
# `DataFrame.append` in the loop is quadratic and was removed in pandas 2.0.
incorr_mwe_rows = []

# groupby(sort=False) visits sentences in order of first appearance and slices the
# frame once, instead of re-filtering the whole frame several times per sentence.
for sent_id, sent_df in df_train.groupby('sent_id', sort=False):
    pos_list = sent_df['upos'].tolist()
    idx_list = sent_df['id'].tolist()
    form_list = sent_df['form'].tolist()
    mwe_tag_list = sent_df['parseme:mwe'].tolist()
    # `deps` is stored as the text of a list of (relation, head) tuples;
    # empty cells come back from the TSV as NaN and are treated as "no deps"
    deps_list = [ast.literal_eval(elem) if isinstance(elem, str) else []
                 for elem in sent_df['deps'].tolist()]

    for pos_ind in range(len(pos_list) - 1):
        next_ind = pos_ind + 1
        mwe_pos = f'{pos_list[pos_ind]}+{pos_list[next_ind]}'

        # Cheap checks first; ids are converted with int() only for pairs that
        # passed them, exactly as the short-circuiting condition did before.
        if mwe_pos not in correct_pos:
            continue
        if mwe_tag_list[pos_ind] != '*' or mwe_tag_list[next_ind] != '*':
            continue

        first_word_deps = [elem[1] for elem in deps_list[pos_ind]]
        second_word_deps = [elem[1] for elem in deps_list[next_ind]]

        if int(idx_list[pos_ind]) in second_word_deps or int(idx_list[next_ind]) in first_word_deps:
            mwe_count += 1

            # add MWE count to the dictionary
            mwe_count_dict[mwe_pos] = mwe_count_dict.get(mwe_pos, 0) + 1

            # remember the pair for the output table
            incorr_mwe_rows.append({'type': mwe_pos,
                                    'first_word': form_list[pos_ind],
                                    'second_word': form_list[next_ind]})

incorr_mwe_df = pd.DataFrame(incorr_mwe_rows, columns=['type', 'first_word', 'second_word'])

# save DataFrame to TSV
incorr_mwe_df.to_csv('parseme_incorrect_mwes.tsv', sep='\t', index=False)

print(f'MWE count: {mwe_count}')
mwe_count_dict

MWE count: 29027


{'VERB+PRON': 1907,
 'ADP+NOUN': 7968,
 'NOUN+NOUN': 4485,
 'ADJ+NOUN': 6152,
 'NOUN+ADJ': 3407,
 'VERB+NOUN': 2647,
 'NOUN+VERB': 2260,
 'VERB+ADP': 34,
 'NOUN+PRON': 77,
 'ADJ+PRON': 90}

In [48]:
# display the collected candidate pairs (PoS pattern and the two word forms)
incorr_mwe_df

Unnamed: 0,type,first_word,second_word
0,VERB+PRON,mam,nikogo
1,ADP+NOUN,Oprócz,tysięcy
2,NOUN+NOUN,tysięcy,traktorów
3,ADP+NOUN,dla,opozycji
4,ADJ+NOUN,najnowszą,techniką
...,...,...,...
29022,VERB+NOUN,Unikali,patrzenia
29023,NOUN+PRON,patrzenia,sobie
29024,ADP+NOUN,w,oczy
29025,ADJ+NOUN,metaliczny,smak


In [19]:
# inspect the parsed `deps` entries of one sentence (sent_id == 5);
# empty cells are NaN rather than text, so anything that is not a string maps to []
[ast.literal_eval(elem) if isinstance(elem, str) else []
 for elem in df_train.loc[df_train['sent_id'] == 5, 'deps'].tolist()]

[[('root', 0)],
 [('advmod:emph', 1)],
 [('punct', 11)],
 [('mark', 11)],
 [('case', 6)],
 [('obl', 11)],
 [('case', 8)],
 [('nmod', 6)],
 [('nsubj', 11)],
 [('nmod', 9)],
 [('ccomp', 1)],
 [('xcomp', 11)],
 [('obj', 12)],
 [('case', 16), ('case', 18)],
 [('amod:flat', 16)],
 [('obl', 12)],
 [('cc', 18)],
 [('obl', 12), ('conj', 16)],
 [('punct', 29)],
 [('advmod', 29)],
 [('punct', 23)],
 [('nmod:poss', 23)],
 [('parataxis:insert', 29)],
 [('punct', 23)],
 [('nsubj', 29)],
 [('cop', 29)],
 [('case', 29)],
 [('fixed', 27)],
 [('acl:relcl', 16), ('acl:relcl', 18)],
 [('case', 31)],
 [('obl', 29)],
 [('amod', 31)],
 [('punct', 1)]]

In [5]:
# get MWE instances: one row per token that opens an MWE
# The opening token is tagged '<id>:<category>' (e.g. '1:IRV'); the previous
# `len > 1` test also matched continuation tokens whose tag is longer than one
# character (ids >= 10, or several ids joined with ';').
df_train[df_train['parseme:mwe'].str.contains(':', regex=False, na=False)]

Unnamed: 0,sent_id,id,form,lemma,upos,xpos,feats,head,deprel,deps,misc,parseme:mwe
177,8,11,wpadam,wpadać,VERB,fin:sg:pri:imperf,"{'Aspect': 'Imp', 'Mood': 'Ind', 'Number': 'Si...",0.0,root,"[('root', 0)]",,1:LVC.full
184,9,3,znajduje,znajdować,VERB,fin:sg:ter:imperf,"{'Aspect': 'Imp', 'Mood': 'Ind', 'Number': 'Si...",0.0,root,,,1:IRV
301,14,39,pełniła,pełnić,VERB,praet:sg:f:imperf,"{'Aspect': 'Imp', 'Gender': 'Fem', 'Mood': 'In...",13.0,conj,"[('conj', 5), ('conj', 13)]",,1:LVC.full
340,16,14,trzymająca,trzymać,ADJ,pact:sg:nom:f:imperf:aff,"{'Aspect': 'Imp', 'Case': 'Nom', 'Gender': 'Fe...",13.0,acl,"[('acl', 13)]",,1:LVC.full
404,18,54,się,się,PRON,part,"{'PronType': 'Prs', 'Reflex': 'Yes'}",55.0,expl:pv,"[('expl:pv', 55)]",,1:IRV
...,...,...,...,...,...,...,...,...,...,...,...,...
298272,17718,9,czują,czuć,VERB,fin:pl:ter:imperf,"{'Aspect': 'Imp', 'Mood': 'Ind', 'Number': 'Pl...",1.0,conj,"[('root', 0), ('conj', 1)]",,1:IRV
298285,17719,5,ankiety,ankieta,NOUN,subst:sg:gen:f,"{'Case': 'Gen', 'Gender': 'Fem', 'Number': 'Si...",4.0,nmod,,,1:LVC.full
298298,17720,2,Widzę,widzieć,VERB,fin:sg:pri:imperf,"{'Aspect': 'Imp', 'Mood': 'Ind', 'Number': 'Si...",0.0,root,,,1:VID
298337,17723,9,starała,starać,VERB,praet:sg:f:imperf,"{'Aspect': 'Imp', 'Gender': 'Fem', 'Mood': 'In...",6.0,ccomp:cleft,"[('ccomp:cleft', 6)]",,1:IRV


In [6]:
# check if there are any MWEs without VERB:
# count adjacent token pairs where both tokens are MWE-tagged (tag != '*')
# and neither token is a VERB
nonverb_mwe_count = 0

# one slice per sentence instead of re-filtering the whole frame twice per sentence
for sent_id, sent_df in df_train.groupby('sent_id', sort=False):
    pos_list = sent_df['upos'].tolist()
    mwe_tag_list = sent_df['parseme:mwe'].tolist()

    for pos_ind in range(len(pos_list) - 1):
        next_ind = pos_ind + 1
        neither_is_verb = pos_list[pos_ind] != 'VERB' and pos_list[next_ind] != 'VERB'
        both_tagged = mwe_tag_list[pos_ind] != '*' and mwe_tag_list[next_ind] != '*'

        if neither_is_verb and both_tagged:
            nonverb_mwe_count += 1

nonverb_mwe_count

1026

In [9]:
# count the PoS patterns ('<upos>+<upos>') of adjacent token pairs that are both
# MWE-tagged (tag != '*'); this also answers whether NOUN+NOUN MWEs occur
mwe_count_dict = {}

# one slice per sentence instead of re-filtering the whole frame twice per sentence
for sent_id, sent_df in df_train.groupby('sent_id', sort=False):
    pos_list = sent_df['upos'].tolist()
    mwe_tag_list = sent_df['parseme:mwe'].tolist()
    # marks tokens already consumed by a counted pair
    mwe_component_ind = [False] * len(pos_list)

    for pos_ind in range(len(pos_list) - 1):
        # a token that closed the previous pair does not start a new one
        if mwe_component_ind[pos_ind]:
            continue

        next_ind = pos_ind + 1
        if mwe_tag_list[pos_ind] != '*' and mwe_tag_list[next_ind] != '*':
            mwe_type = f'{pos_list[pos_ind]}+{pos_list[next_ind]}'
            mwe_count_dict[mwe_type] = mwe_count_dict.get(mwe_type, 0) + 1

            mwe_component_ind[pos_ind] = True
            mwe_component_ind[next_ind] = True

mwe_count_dict

{'VERB+ADP': 217,
 'VERB+PRON': 1949,
 'VERB+NOUN': 738,
 'ADJ+NOUN': 70,
 'PRON+VERB': 286,
 'PART+VERB': 47,
 'ADJ+PRON': 88,
 'NOUN+VERB': 113,
 'NOUN+NOUN': 131,
 'ADP+ADJ': 11,
 'NOUN+ADJ': 49,
 'ADP+NOUN': 117,
 'VERB+PROPN': 12,
 'ADJ+ADP': 32,
 'NOUN+ADP': 21,
 'VERB+AUX': 1,
 'ADJ+ADJ': 1,
 'VERB+ADV': 5,
 'ADP+PART': 1,
 'DET+NOUN': 2,
 'NOUN+PRON': 52,
 'VERB+ADJ': 13,
 'AUX+NOUN': 3,
 'NOUN+PROPN': 18,
 'VERB+SCONJ': 3,
 'VERB+PART': 3,
 'AUX+AUX': 2,
 'ADP+ADV': 2,
 'ADP+PRON': 8,
 'AUX+ADP': 3,
 'PRON+PART': 2,
 'PROPN+ADJ': 4,
 'NOUN+PART': 1,
 'VERB+VERB': 1,
 'PROPN+PRON': 2,
 'ADV+PRON': 4,
 'NOUN+CCONJ': 1,
 'ADP+PROPN': 5,
 'PART+PRON': 4,
 'PRON+NOUN': 3,
 'SCONJ+NOUN': 3,
 'ADV+ADJ': 1,
 'ADV+ADV': 1,
 'ADV+VERB': 6,
 'PUNCT+NOUN': 1,
 'CCONJ+ADP': 1,
 'PRON+ADP': 3,
 'ADV+NOUN': 1,
 'PROPN+VERB': 2,
 'CCONJ+PRON': 1,
 'VERB+CCONJ': 1,
 'VERB+DET': 2,
 'ADP+DET': 2,
 'SCONJ+ADJ': 1,
 'ADJ+PROPN': 1,
 'VERB+PUNCT': 3,
 'SCONJ+VERB': 2,
 'ADV+ADP': 1,
 'SCONJ+PRON':

In [10]:
# total number of pairs counted across all PoS patterns in mwe_count_dict
correct_mwe_count = sum(mwe_count_dict.values())

correct_mwe_count

4064