In [4]:
import ast
import math
import os

import conllu
import pandas as pd

<h4>Preprocess data and save to TSV</h4>

In [2]:
# Raw PARSEME (Polish) corpus file that the following cells read.
filename = 'dev.cupt'
dir_path = os.path.join(*['..', 'storage', 'parseme', 'pl', 'raw_data'])

# Full path of the corpus file inside the raw-data directory.
filepath = os.path.join(dir_path, filename)

In [4]:
# Peek at the first token of the first sentence to learn which columns the
# file provides; 'sent_id' (a running sentence index) is put in front of them.
with open(filepath, "r", encoding="utf-8") as f:
    sentence = next(conllu.parse_incr(f))
    tok = sentence[0]
    column_names = ['sent_id'] + list(tok.keys())

# Token fields copied into the table, one row per token.
token_fields = ['id', 'form', 'lemma', 'upos', 'xpos', 'feats',
                'head', 'deprel', 'deps', 'misc', 'parseme:mwe']

# Collect plain dict rows and build the DataFrame once at the end.
# Calling `DataFrame.append` inside the loop copies the whole frame on every
# sentence (quadratic) and the method was removed in pandas 2.0.
token_rows = []

with open(filepath, "r", encoding="utf-8") as f:
    sent_id = 0
    for sentence in conllu.parse_incr(f):
        for tok in sentence:
            row = {'sent_id': sent_id}
            row.update((field, tok[field]) for field in token_fields)
            token_rows.append(row)

        sent_id += 1

df = pd.DataFrame(token_rows, columns=column_names)

df

Unnamed: 0,sent_id,id,form,lemma,upos,xpos,feats,head,deprel,deps,misc,parseme:mwe
0,0,1,Szczególna,szczególny,ADJ,adj:sg:nom:f:pos,"{'Case': 'Nom', 'Degree': 'Pos', 'Gender': 'Fe...",2,amod,,,*
1,0,2,uwaga,uwaga,NOUN,subst:sg:nom:f,"{'Case': 'Nom', 'Gender': 'Fem', 'Number': 'Si...",3,nsubj,,,*
2,0,3,ma,mieć,VERB,fin:sg:ter:imperf,"{'Aspect': 'Imp', 'Mood': 'Ind', 'Number': 'Si...",0,root,,,*
3,0,4,być,być,VERB,inf:imperf,"{'Aspect': 'Imp', 'VerbForm': 'Inf', 'Voice': ...",3,xcomp,,,*
4,0,5,poświęcona,poświęcić,ADJ,ppas:sg:nom:f:perf:aff,"{'Aspect': 'Perf', 'Case': 'Nom', 'Gender': 'F...",4,xcomp,,,*
...,...,...,...,...,...,...,...,...,...,...,...,...
23945,1424,30,na,na,ADP,prep:acc,{'AdpType': 'Prep'},31,case,"[(case, 31)]",{'Case': 'Acc'},*
23946,1424,31,zawsze,zawsze,ADV,adv,{'PronType': 'Tot'},28,advmod,"[(advmod, 28)]",,*
23947,1424,32,jego,on,PRON,ppron3:sg:gen:m1:ter:akc:npraep,"{'Animacy': 'Hum', 'Case': 'Gen', 'Gender': 'M...",33,nmod,"[(nmod, 33)]",,*
23948,1424,33,tajemnicą,tajemnica,NOUN,subst:sg:inst:f,"{'Case': 'Ins', 'Gender': 'Fem', 'Number': 'Si...",28,xcomp:pred,"[(xcomp:pred, 28)]",{'SpaceAfter': 'No'},*


In [7]:
# Write the token table next to the raw corpus file as '<name>_df.tsv'
# (tab-separated, index column omitted).
tsv_name = f'{filename.split(".")[0]}_df.tsv'
df.to_csv(os.path.join(dir_path, tsv_name), sep='\t', index=False)

In [12]:
# Tokens that are part of an MWE, i.e. whose 'parseme:mwe' tag is anything but '*'.
is_mwe_token = df['parseme:mwe'].ne('*')
df[is_mwe_token]

Unnamed: 0,sent_id,id,form,lemma,upos,xpos,feats,head,deprel,deps,misc,parseme:mwe
8,0,9,doprowadzenie,doprowadzić,NOUN,ger:sg:nom:n:perf:aff,"{'Aspect': 'Perf', 'Case': 'Nom', 'Gender': 'N...",2,nsubj,"[(nsubj, 2)]",,1:LVC.cause
9,0,10,do,do,ADP,prep:gen,{'AdpType': 'Prep'},12,case,"[(case, 12)]",{'Case': 'Gen'},1
12,0,13,chrztów,chrzest,NOUN,subst:pl:gen:m3,"{'Animacy': 'Inan', 'Case': 'Gen', 'Gender': '...",12,nmod:arg,"[(nmod:arg, 12)]",,1
43,1,14,cieszą,cieszyć,VERB,fin:pl:ter:imperf,"{'Aspect': 'Imp', 'Mood': 'Ind', 'Number': 'Pl...",8,acl:relcl,,,1:IRV
44,1,15,się,się,PRON,qub,"{'PronType': 'Prs', 'Reflex': 'Yes'}",14,expl:pv,,,1
...,...,...,...,...,...,...,...,...,...,...,...,...
73727,4388,4,relację,relacja,NOUN,subst:sg:acc:f,"{'Case': 'Acc', 'Gender': 'Fem', 'Number': 'Si...",2,obj,"[(obj, 2)]",,1
73735,4389,5,udzielić,udzielić,VERB,inf:perf,"{'Aspect': 'Perf', 'VerbForm': 'Inf', 'Voice':...",2.0,xcomp,"[(xcomp, 2)]",,1:LVC.full
73738,4389,8,rad,rada,NOUN,subst:pl:gen:f,"{'Case': 'Gen', 'Gender': 'Fem', 'Number': 'Pl...",5.0,obj,"[(obj, 5)]",{'SpaceAfter': 'No'},1
73744,4389,13,śmiał,śmiać,VERB,praet:sg:m1:imperf,"{'Animacy': 'Hum', 'Aspect': 'Imp', 'Gender': ...",10.0,ccomp,"[(ccomp, 10)]",,2:IRV


In [16]:
# Tokens that open an MWE instance (one row per opening token).
# In the 'parseme:mwe' column the opening token of an MWE carries
# "<id>:<category>" (e.g. "1:LVC.cause"), later tokens carry only "<id>", and
# several entries can be joined with ';'. Testing for ':' instead of
# `len > 1` keeps continuation tags that are longer than one character
# (e.g. "1;2" or a two-digit id such as "10") from being counted as new MWEs.
df[df['parseme:mwe'].str.contains(':', regex=False, na=False)]

Unnamed: 0,sent_id,id,form,lemma,upos,xpos,feats,head,deprel,deps,misc,parseme:mwe
8,0,9,doprowadzenie,doprowadzić,NOUN,ger:sg:nom:n:perf:aff,"{'Aspect': 'Perf', 'Case': 'Nom', 'Gender': 'N...",2,nsubj,"[(nsubj, 2)]",,1:LVC.cause
43,1,14,cieszą,cieszyć,VERB,fin:pl:ter:imperf,"{'Aspect': 'Imp', 'Mood': 'Ind', 'Number': 'Pl...",8,acl:relcl,,,1:IRV
144,6,27,uśmiechał,uśmiechać,VERB,praet:sg:m1:imperf,"{'Animacy': 'Hum', 'Aspect': 'Imp', 'Gender': ...",2,conj,"[(root, 0), (conj, 2)]",,1:IRV
149,7,1,Przychodzą,przychodzić,VERB,fin:pl:ter:imperf,"{'Aspect': 'Imp', 'Mood': 'Ind', 'Number': 'Pl...",0,root,"[(root, 0)]",,1:VID
304,15,10,brać,brać,VERB,inf:imperf,"{'Aspect': 'Imp', 'VerbForm': 'Inf', 'Voice': ...",9,xcomp,"[(xcomp, 9)]",,1:LVC.full
...,...,...,...,...,...,...,...,...,...,...,...,...
73645,4384,28,chlubi,chlubić,VERB,fin:sg:ter:imperf,"{'Aspect': 'Imp', 'Mood': 'Ind', 'Number': 'Si...",25,parataxis:insert,"[(parataxis:insert, 25)]",,2:IRV
73689,4386,17,był,być,VERB,praet:sg:m1:imperf,"{'Animacy': 'Hum', 'Aspect': 'Imp', 'Gender': ...",0.0,root,"[(root, 0)]",,1:VID
73725,4388,2,zdał,zdać,VERB,praet:sg:m1:perf,"{'Animacy': 'Hum', 'Aspect': 'Perf', 'Gender':...",0,root,"[(root, 0)]",,1:LVC.full
73735,4389,5,udzielić,udzielić,VERB,inf:perf,"{'Aspect': 'Perf', 'VerbForm': 'Inf', 'Voice':...",2.0,xcomp,"[(xcomp, 2)]",,1:LVC.full


In [18]:
# MWE instances that share a sentence with an earlier MWE instance.
# An opening token is recognised by the ':' of its "<id>:<category>" tag;
# `len > 1` would also match continuation tags such as "1;2" or "10".
temp_df = df[df['parseme:mwe'].str.contains(':', regex=False, na=False)]

# `duplicated` marks every opening token except the first one of each sentence.
temp_df[temp_df.duplicated(subset='sent_id')]

Unnamed: 0,sent_id,id,form,lemma,upos,xpos,feats,head,deprel,deps,misc,parseme:mwe
1291,75,13,wprowadzono,wprowadzić,VERB,imps:perf,"{'Aspect': 'Perf', 'Mood': 'Ind', 'Person': '0...",7,acl:relcl,"[(acl:relcl, 7)]",,2:LVC.cause
4315,268,24,dokonanie,dokonanie,NOUN,ger:sg:acc:n:perf:aff,"{'Aspect': 'Perf', 'Case': 'Acc', 'Gender': 'N...",21,conj,,,2:LVC.full
4705,286,41,umowy,umowa,NOUN,subst:sg:gen:f,"{'Case': 'Gen', 'Gender': 'Fem', 'Number': 'Si...",40,nmod:arg,"[(nmod:arg, 40)]",,2:LVC.full
4805,294,5,samookreśleniem,samookreślenie,NOUN,subst:sg:inst:n,"{'Case': 'Ins', 'Gender': 'Neut', 'Number': 'S...",3,nmod,,,2:IRV
4847,297,22,pochłonął,pochłonąć,VERB,praet:sg:m3:perf,"{'Animacy': 'Inan', 'Aspect': 'Perf', 'Gender'...",16,acl:relcl,"[(acl:relcl, 16)]",,2:LVC.full
...,...,...,...,...,...,...,...,...,...,...,...,...
71629,4263,24,mamy,mieć,VERB,fin:pl:pri:imperf,"{'Aspect': 'Imp', 'Mood': 'Ind', 'Number': 'Pl...",12,ccomp,"[(ccomp, 12)]",,2:VID
72645,4323,17,wchodząca,wchodzić,ADJ,pact:sg:nom:f:imperf:aff,"{'Aspect': 'Imp', 'Case': 'Nom', 'Gender': 'Fe...",21,acl,,,2:VID
73288,4366,19,mają,mieć,VERB,fin:pl:ter:imperf,"{'Aspect': 'Imp', 'Mood': 'Ind', 'Number': 'Pl...",3,conj,,,2:LVC.full
73645,4384,28,chlubi,chlubić,VERB,fin:sg:ter:imperf,"{'Aspect': 'Imp', 'Mood': 'Ind', 'Number': 'Si...",25,parataxis:insert,"[(parataxis:insert, 25)]",,2:IRV


<h4>Read train data TSV</h4>

In [3]:
# Load the token table of the training split from its tab-separated file.
train_tsv_path = os.path.join(dir_path, 'train_df.tsv')
df_train = pd.read_csv(train_tsv_path, sep='\t')

df_train

Unnamed: 0,sent_id,id,form,lemma,upos,xpos,feats,head,deprel,deps,misc,parseme:mwe
0,0,1,Większym,duży,ADJ,adj:sg:inst:m3:com,"{'Animacy': 'Inan', 'Case': 'Ins', 'Degree': '...",2.0,amod,,,*
1,0,2,problemem,problem,NOUN,subst:sg:inst:m3,"{'Animacy': 'Inan', 'Case': 'Ins', 'Gender': '...",0.0,root,,,*
2,0,3,będzie,być,AUX,bedzie:sg:ter:imperf,"{'Aspect': 'Imp', 'Mood': 'Ind', 'Number': 'Si...",2.0,cop,,,*
3,0,4,znalezienie,znaleźć,NOUN,ger:sg:nom:n:perf:aff,"{'Aspect': 'Perf', 'Case': 'Nom', 'Gender': 'N...",2.0,nsubj,,,*
4,0,5,zawodnika,zawodnik,PROPN,subst:sg:gen:m1,"{'Animacy': 'Hum', 'Case': 'Gen', 'Gender': 'M...",4.0,obj,,{'SpaceAfter': 'No'},*
...,...,...,...,...,...,...,...,...,...,...,...,...
298432,17730,1,Gorąco,gorąco,ADV,adv:pos,{'Degree': 'Pos'},3.0,advmod,,,*
298433,17730,2,więc,więc,CCONJ,conj,,1.0,fixed,,,*
298434,17730,3,polecam,polecać,VERB,fin:sg:pri:imperf,"{'Aspect': 'Imp', 'Mood': 'Ind', 'Number': 'Si...",0.0,root,,,*
298435,17730,4,kartkę,kartka,NOUN,subst:sg:acc:f,"{'Case': 'Acc', 'Gender': 'Fem', 'Number': 'Si...",3.0,obj,,{'SpaceAfter': 'No'},*


In [16]:
# Build the DataFrame (and counts) of potential INCORRECT MWEs in the test set:
# adjacent word pairs that
#   * have one of the PoS patterns in `correct_pos`,
#   * are both untagged ('*') in the 'parseme:mwe' column,
#   * are linked through the 'deps' column (one word's id is a head of the other),
#   * and whose joined lemmas are not in the list of correct MWEs.
#
# PoS types, which appear at least 50 times in train set
# verb + pron (pronoun)
# verb + noun
# adj + noun
# pron + verb
# verb + adp (preposition)
# adj + pron
# noun + verb
# noun + noun
# noun + adj
# adp + noun
# noun + pron
# NOTE(review): 'pron + verb' is listed above but 'PRON+VERB' is absent from
# `correct_pos` below - confirm whether that omission is intended.
test_dir = os.path.join('..', 'storage', 'parseme', 'pl', 'preprocessed_data', 'test')

df = pd.read_csv(os.path.join(test_dir, 'test_df.tsv'), sep='\t')
correct_mwe_list = pd.read_csv(os.path.join(test_dir, 'parseme_test_correct_mwes.tsv'), sep='\t')['full_mwe_lemma'].tolist()
# Set copy for constant-time membership tests inside the loop.
correct_mwe_set = set(correct_mwe_list)

incorr_mwe_columns = ['type', 'first_word', 'first_word_lemma', 'first_word_id',
                      'second_word', 'second_word_lemma', 'second_word_id',
                      'full_mwe', 'full_mwe_lemma', 'sentence']

correct_pos = ['VERB+PRON', 'VERB+NOUN', 'ADJ+NOUN', 'VERB+ADP', 'ADJ+PRON',
               'NOUN+VERB', 'NOUN+NOUN', 'NOUN+ADJ', 'ADP+NOUN', 'NOUN+PRON']

mwe_count = 0
mwe_count_dict = {}

# Rows are collected in a list and turned into a DataFrame once after the loop;
# `DataFrame.append` per row is quadratic and was removed in pandas 2.0.
incorr_mwe_rows = []

# One group per sentence, visited in order of first appearance; this replaces
# re-filtering the whole frame once per column for every sentence.
for sent_id, sent_df in df.groupby('sent_id', sort=False):
    pos_list = sent_df['upos'].tolist()
    idx_list = sent_df['id'].tolist()
    form_list = sent_df['form'].tolist()
    lemma_list = sent_df['lemma'].tolist()
    mwe_tag_list = sent_df['parseme:mwe'].tolist()
    # 'deps' is stored as the string form of a list of (relation, head) tuples;
    # missing values (not str) are treated as "no dependencies".
    deps_list = [ast.literal_eval(elem) if isinstance(elem, str) else []
                 for elem in sent_df['deps'].tolist()]
    sentence = ' '.join(str(word) for word in form_list)

    for pos_ind in range(len(pos_list) - 1):
        next_ind = pos_ind + 1
        mwe_pos = f'{pos_list[pos_ind]}+{pos_list[next_ind]}'

        if mwe_pos not in correct_pos:
            continue

        # both words must be outside of any annotated MWE
        if mwe_tag_list[pos_ind] != '*' or mwe_tag_list[next_ind] != '*':
            continue

        # heads referenced by each word's 'deps' entries
        first_word_deps = [elem[1] for elem in deps_list[pos_ind]]
        second_word_deps = [elem[1] for elem in deps_list[next_ind]]

        if not (int(idx_list[pos_ind]) in second_word_deps or
                int(idx_list[next_ind]) in first_word_deps):
            continue

        # str() formatting keeps a missing (NaN) lemma from raising, matching
        # how 'full_mwe_lemma' is built for the saved record.
        full_mwe_lemma = f'{lemma_list[pos_ind]} {lemma_list[next_ind]}'
        if full_mwe_lemma in correct_mwe_set:
            continue

        mwe_count += 1

        # add MWE count to the dictionary
        mwe_count_dict[mwe_pos] = mwe_count_dict.get(mwe_pos, 0) + 1

        # 'first_word_id' / 'second_word_id' are 0-based positions within the
        # sentence, not the values of the 'id' column.
        incorr_mwe_rows.append({'type': mwe_pos,
                                'first_word': form_list[pos_ind],
                                'first_word_lemma': lemma_list[pos_ind],
                                'first_word_id': pos_ind,
                                'second_word': form_list[next_ind],
                                'second_word_lemma': lemma_list[next_ind],
                                'second_word_id': next_ind,
                                'full_mwe': f'{form_list[pos_ind]} {form_list[next_ind]}',
                                'full_mwe_lemma': full_mwe_lemma,
                                'sentence': sentence})

incorr_mwe_df = pd.DataFrame(incorr_mwe_rows, columns=incorr_mwe_columns)

# save DataFrame to TSV
incorr_mwe_df.to_csv(os.path.join(test_dir, 'parseme_test_incorrect_mwes.tsv'), sep='\t', index=False)

print(f'MWE count: {mwe_count}')
mwe_count_dict

MWE count: 7114


{'ADJ+NOUN': 1575,
 'NOUN+NOUN': 1112,
 'NOUN+ADJ': 825,
 'ADP+NOUN': 1920,
 'VERB+NOUN': 673,
 'NOUN+VERB': 525,
 'VERB+PRON': 445,
 'VERB+ADP': 10,
 'ADJ+PRON': 19,
 'NOUN+PRON': 10}

In [17]:
# Reload the incorrect-MWE candidates of the test split from their TSV file.
incorr_mwe_path = os.path.join('..', 'storage', 'parseme', 'pl', 'preprocessed_data', 'test',
                               'parseme_test_incorrect_mwes.tsv')
incorr_mwe_df = pd.read_csv(incorr_mwe_path, sep='\t')
incorr_mwe_df

Unnamed: 0,type,first_word,first_word_lemma,first_word_id,second_word,second_word_lemma,second_word_id,full_mwe,full_mwe_lemma,sentence
0,ADJ+NOUN,Głównym,główny,0,zadaniem,zadanie,1,Głównym zadaniem,główny zadanie,Głównym zadaniem Uniwersytetu Nanzan jest więc...
1,NOUN+NOUN,zadaniem,zadanie,1,Uniwersytetu,uniwersytet,2,zadaniem Uniwersytetu,zadanie uniwersytet,Głównym zadaniem Uniwersytetu Nanzan jest więc...
2,ADJ+NOUN,maksymalnej,maksymalny,10,liczby,liczba,11,maksymalnej liczby,maksymalny liczba,Głównym zadaniem Uniwersytetu Nanzan jest więc...
3,ADJ+NOUN,całej,cały,20,kultury,kultura,21,całej kultury,cały kultura,Głównym zadaniem Uniwersytetu Nanzan jest więc...
4,NOUN+ADJ,kultury,kultura,21,japońskiej,japoński,22,kultury japońskiej,kultura japoński,Głównym zadaniem Uniwersytetu Nanzan jest więc...
...,...,...,...,...,...,...,...,...,...,...
7109,NOUN+ADJ,Książek,książka,10,Zakazanych,zakazany,11,Książek Zakazanych,książka zakazany,"Jedyną pozostałością z czasów , kiedy wydawany..."
7110,NOUN+ADJ,rubryka,rubryka,21,oceniająca,oceniać,22,rubryka oceniająca,rubryka oceniać,"Jedyną pozostałością z czasów , kiedy wydawany..."
7111,ADJ+NOUN,oceniająca,oceniać,22,filmy,film,23,oceniająca filmy,oceniać film,"Jedyną pozostałością z czasów , kiedy wydawany..."
7112,NOUN+ADJ,filmy,film,23,wyświetlane,wyświetlać,24,filmy wyświetlane,film wyświetlać,"Jedyną pozostałością z czasów , kiedy wydawany..."


In [19]:
# Parse the stringified 'deps' values of training sentence 5 back into Python
# lists. Missing values (NaN, i.e. not str) become empty lists because
# `ast.literal_eval` raises on them.
[ast.literal_eval(elem) if isinstance(elem, str) else []
 for elem in df_train.loc[df_train['sent_id'] == 5, 'deps'].tolist()]

[[('root', 0)],
 [('advmod:emph', 1)],
 [('punct', 11)],
 [('mark', 11)],
 [('case', 6)],
 [('obl', 11)],
 [('case', 8)],
 [('nmod', 6)],
 [('nsubj', 11)],
 [('nmod', 9)],
 [('ccomp', 1)],
 [('xcomp', 11)],
 [('obj', 12)],
 [('case', 16), ('case', 18)],
 [('amod:flat', 16)],
 [('obl', 12)],
 [('cc', 18)],
 [('obl', 12), ('conj', 16)],
 [('punct', 29)],
 [('advmod', 29)],
 [('punct', 23)],
 [('nmod:poss', 23)],
 [('parataxis:insert', 29)],
 [('punct', 23)],
 [('nsubj', 29)],
 [('cop', 29)],
 [('case', 29)],
 [('fixed', 27)],
 [('acl:relcl', 16), ('acl:relcl', 18)],
 [('case', 31)],
 [('obl', 29)],
 [('amod', 31)],
 [('punct', 1)]]

In [5]:
# Tokens of the loaded `df_train` table that open an MWE instance.
# The opening token of an MWE is tagged "<id>:<category>" (e.g. "1:LVC.full")
# while later tokens carry only "<id>"; testing for ':' instead of `len > 1`
# avoids counting continuation tags longer than one character
# (e.g. "1;2" or a two-digit id such as "10").
df_train[df_train['parseme:mwe'].str.contains(':', regex=False, na=False)]

Unnamed: 0,sent_id,id,form,lemma,upos,xpos,feats,head,deprel,deps,misc,parseme:mwe
177,8,11,wpadam,wpadać,VERB,fin:sg:pri:imperf,"{'Aspect': 'Imp', 'Mood': 'Ind', 'Number': 'Si...",0.0,root,"[('root', 0)]",,1:LVC.full
184,9,3,znajduje,znajdować,VERB,fin:sg:ter:imperf,"{'Aspect': 'Imp', 'Mood': 'Ind', 'Number': 'Si...",0.0,root,,,1:IRV
301,14,39,pełniła,pełnić,VERB,praet:sg:f:imperf,"{'Aspect': 'Imp', 'Gender': 'Fem', 'Mood': 'In...",13.0,conj,"[('conj', 5), ('conj', 13)]",,1:LVC.full
340,16,14,trzymająca,trzymać,ADJ,pact:sg:nom:f:imperf:aff,"{'Aspect': 'Imp', 'Case': 'Nom', 'Gender': 'Fe...",13.0,acl,"[('acl', 13)]",,1:LVC.full
404,18,54,się,się,PRON,part,"{'PronType': 'Prs', 'Reflex': 'Yes'}",55.0,expl:pv,"[('expl:pv', 55)]",,1:IRV
...,...,...,...,...,...,...,...,...,...,...,...,...
298272,17718,9,czują,czuć,VERB,fin:pl:ter:imperf,"{'Aspect': 'Imp', 'Mood': 'Ind', 'Number': 'Pl...",1.0,conj,"[('root', 0), ('conj', 1)]",,1:IRV
298285,17719,5,ankiety,ankieta,NOUN,subst:sg:gen:f,"{'Case': 'Gen', 'Gender': 'Fem', 'Number': 'Si...",4.0,nmod,,,1:LVC.full
298298,17720,2,Widzę,widzieć,VERB,fin:sg:pri:imperf,"{'Aspect': 'Imp', 'Mood': 'Ind', 'Number': 'Si...",0.0,root,,,1:VID
298337,17723,9,starała,starać,VERB,praet:sg:f:imperf,"{'Aspect': 'Imp', 'Gender': 'Fem', 'Mood': 'In...",6.0,ccomp:cleft,"[('ccomp:cleft', 6)]",,1:IRV


In [6]:
# Check if there are any MWEs without VERB: count adjacent token pairs in which
# both tokens carry an MWE tag (not '*') and neither token is a VERB.
# NOTE(review): adjacency alone does not prove that both tokens belong to the
# same MWE - treat the number as an upper-bound estimate.
nonverb_mwe_count = 0

# Iterate sentence by sentence (order of first appearance) without re-filtering
# the whole frame for every sentence.
for sent_id, sent_df in df_train.groupby('sent_id', sort=False):
    pos_list = sent_df['upos'].tolist()
    mwe_tag_list = sent_df['parseme:mwe'].tolist()

    for pos_ind in range(len(pos_list) - 1):
        neither_is_verb = pos_list[pos_ind] != 'VERB' and pos_list[pos_ind + 1] != 'VERB'
        both_tagged = mwe_tag_list[pos_ind] != '*' and mwe_tag_list[pos_ind + 1] != '*'

        if neither_is_verb and both_tagged:
            nonverb_mwe_count += 1

nonverb_mwe_count

1026

In [9]:
# get dictionary and dataframe containing PoS tag counts for CORRECT MWEs
def get_dict_key(mwe_dict, mwe_type_id, last_key=False):
    """Return the `mwe_dict` key '<mwe_type_id>-<n>' for an MWE id.

    Keys have the form '<id>-<n>', where <n> numbers the entries stored for
    the same id, starting at 0.

    Args:
        mwe_dict: dictionary whose keys were produced by this function.
        mwe_type_id: MWE id as a string (e.g. '1').
        last_key: if False, return the key for a NEW entry of this id
            (n = number of existing entries); if True, return the key of the
            most recently added entry (n = number of existing entries - 1,
            or 0 when there are none).

    Returns:
        The key string '<mwe_type_id>-<n>'.
    """
    # Match the complete id in front of the '-' separator. Comparing only the
    # first character of the key would count e.g. '10-0' as an entry of id '1'.
    key_prefix = f'{mwe_type_id}-'
    mwe_type_count = sum(1 for key in mwe_dict if key.startswith(key_prefix))

    if last_key and mwe_type_count > 0:
        mwe_type_count -= 1

    return f'{mwe_type_id}-{mwe_type_count}'

# Extract CORRECT two-word MWEs (annotated, adjacent components) together with
# the counts of their PoS patterns.
# NOTE: despite its name, `df_train` is loaded from the TEST split here
# (test_df.tsv).
df_train = pd.read_csv(os.path.join('..', 'storage', 'parseme', 'pl', 'preprocessed_data', 'test', 'test_df.tsv'), sep='\t')

correct_mwe_columns = ['type', 'first_word', 'first_word_lemma', 'first_word_id',
                       'second_word', 'second_word_lemma', 'second_word_id',
                       'full_mwe', 'full_mwe_lemma', 'sentence']
mwe_count_dict = {}

# Rows are collected in a list and turned into a DataFrame once after the loop;
# `DataFrame.append` per row is quadratic and was removed in pandas 2.0.
correct_mwe_rows = []

# One group per sentence, visited in order of first appearance.
for sent_id, sent_df in df_train.groupby('sent_id', sort=False):
    pos_list = sent_df['upos'].tolist()
    mwe_tag_list = sent_df['parseme:mwe'].tolist()
    form_list = sent_df['form'].tolist()
    lemma_list = sent_df['lemma'].tolist()
    sentence = ' '.join(str(word) for word in form_list)

    # mwe_dict maps '<id>-<n>' -> [MWE text, 'POS+POS+...', [token positions]]
    mwe_dict = {}

    # Walk through the tags one by one (this also covers MWEs longer than 2):
    #   * '*'                      -> token is not part of any MWE, skip it
    #   * entry of 1 character     -> next component of an already opened MWE
    #   * entry longer than 1 char -> opens a new MWE ("<id>:<category>")
    # A tag may hold several ';'-separated entries.
    # NOTE(review): this assumes single-character MWE ids - a new MWE is keyed
    # by `related_mwe[0]` and only one-character entries count as
    # continuations, so ids >= 10 would be mishandled.
    for ind, mwe_tag in enumerate(mwe_tag_list):
        if mwe_tag == '*':
            continue

        if len(mwe_tag) == 1:
            mwe_entry = mwe_dict[get_dict_key(mwe_dict, mwe_tag, last_key=True)]
            mwe_entry[0] += f' {form_list[ind].lower()}'
            mwe_entry[1] += f'+{pos_list[ind]}'
            mwe_entry[2].append(ind)
            continue

        for related_mwe in mwe_tag.split(';'):
            if len(related_mwe) == 1:
                mwe_entry = mwe_dict[get_dict_key(mwe_dict, related_mwe, last_key=True)]
                mwe_entry[0] += f' {form_list[ind].lower()}'
                mwe_entry[1] += f'+{pos_list[ind]}'
                mwe_entry[2].append(ind)

            elif len(related_mwe) > 1:
                new_mwe_key = get_dict_key(mwe_dict, related_mwe[0], last_key=False)
                mwe_dict[new_mwe_key] = [form_list[ind], pos_list[ind], [ind]]

    # After the whole sentence has been read, keep only the MWEs that consist
    # of exactly two directly adjacent tokens.
    for mwe_key in sorted(mwe_dict, key=str.lower):
        mwe_pos = mwe_dict[mwe_key][1]
        mwe_ind_list = mwe_dict[mwe_key][2]

        if len(mwe_ind_list) != 2 or mwe_ind_list[1] - mwe_ind_list[0] != 1:
            continue

        first_ind, second_ind = mwe_ind_list

        # 'first_word_id' / 'second_word_id' are 0-based positions within the
        # sentence, not the values of the 'id' column.
        correct_mwe_rows.append({'type': mwe_pos,
                                 'first_word': form_list[first_ind],
                                 'first_word_lemma': lemma_list[first_ind],
                                 'first_word_id': first_ind,
                                 'second_word': form_list[second_ind],
                                 'second_word_lemma': lemma_list[second_ind],
                                 'second_word_id': int(second_ind),
                                 'full_mwe': str(form_list[first_ind]) + ' ' + str(form_list[second_ind]),
                                 'full_mwe_lemma': str(lemma_list[first_ind]) + ' ' + str(lemma_list[second_ind]),
                                 'sentence': sentence})

        mwe_count_dict[mwe_pos] = mwe_count_dict.get(mwe_pos, 0) + 1

df_correct_mwe = pd.DataFrame(correct_mwe_rows, columns=correct_mwe_columns)

df_correct_mwe

Unnamed: 0,type,first_word,first_word_lemma,first_word_id,second_word,second_word_lemma,second_word_id,full_mwe,full_mwe_lemma,sentence
0,VERB+PRON,cieszą,cieszyć,13,się,się,14,cieszą się,cieszyć się,Zagospodarowane zostaną tereny w pobliżu pomni...
1,VERB+PRON,uśmiechał,uśmiechać,26,się,się,27,uśmiechał się,uśmiechać się,"Tamci podchwycili okrzyk , przestali tupać , p..."
2,VERB+NOUN,brać,brać,9,kredytów,kredyt,10,brać kredytów,brać kredyt,"- Ale jak rozwijać produkcję , skoro nie warto..."
3,VERB+NOUN,rozdziera,rozdzierać,11,szaty,szata,12,rozdziera szaty,rozdzierać szata,Jak dotąd lewica nie ma nic konkretnego do zao...
4,VERB+PRON,dzieje,dziać,2,się,się,3,dzieje się,dziać się,W praktyce dzieje się tak w większości spraw r...
...,...,...,...,...,...,...,...,...,...,...
873,ADJ+NOUN,mające,mieć,7,szansę,szansa,8,mające szansę,mieć szansa,- To powinny być przecież ostatnie osoby mając...
874,VERB+PRON,wydał,wydać,4,się,się,5,wydał się,wydać się,"Zatrzymali samochód , bo wydał się podejrzany ."
875,VERB+NOUN,miały,mieć,6,problemów,problem,7,miały problemów,mieć problem,Jednak wszystkie te ugrupowania nie będą miały...
876,VERB+PRON,nadaje,nadawać,15,się,się,16,nadaje się,nadawać się,"Słusznie , bowiem nawet najwięksi apologeci ni..."


In [10]:
# Persist the correct-MWE table of the test split as a tab-separated file
# (index column omitted).
correct_mwe_path = os.path.join('..', 'storage', 'parseme', 'pl', 'preprocessed_data', 'test',
                                'parseme_test_correct_mwes.tsv')
df_correct_mwe.to_csv(correct_mwe_path, sep='\t', index=False)

In [10]:
# Total number of correct MWEs, summed over all PoS patterns.
correct_mwe_count = sum(mwe_count_dict.values())

correct_mwe_count

4064

In [6]:
# Number of distinct surface forms among the incorrect-MWE candidates
# (a missing value, if present, counts as one distinct value).
incorr_mwe_df['full_mwe'].nunique(dropna=False)

25758

In [7]:
# Tally the INCORRECT MWE candidates per PoS pattern; patterns stay in the
# order in which they first occur in the table.
incorr_mwe_count_dict = {}

for pos_tag in incorr_mwe_df['type']:
    incorr_mwe_count_dict[pos_tag] = incorr_mwe_count_dict.get(pos_tag, 0) + 1

incorr_mwe_count_dict

{'VERB+PRON': 1907,
 'ADP+NOUN': 7968,
 'NOUN+NOUN': 4485,
 'ADJ+NOUN': 6152,
 'NOUN+ADJ': 3407,
 'VERB+NOUN': 2647,
 'NOUN+VERB': 2260,
 'VERB+ADP': 34,
 'NOUN+PRON': 77,
 'ADJ+PRON': 90}

In [19]:
# Size of the incorrect-MWE set when every PoS pattern contributes at most
# 1500 examples.
incorrect_mwe_count = sum(min(count, 1500) for count in incorr_mwe_count_dict.values())

incorrect_mwe_count

10701

In [17]:
# Per-pattern example counts after capping each PoS pattern at 1500.
undersampled_mwe_dict = {pos_tag: min(count, 1500)
                         for pos_tag, count in incorr_mwe_count_dict.items()}
undersampled_mwe_dict

{'VERB+PRON': 1500,
 'ADP+NOUN': 1500,
 'NOUN+NOUN': 1500,
 'ADJ+NOUN': 1500,
 'NOUN+ADJ': 1500,
 'VERB+NOUN': 1500,
 'NOUN+VERB': 1500,
 'VERB+ADP': 34,
 'NOUN+PRON': 77,
 'ADJ+PRON': 90}