In [1]:
import ast
import os
import string

import morfeusz2
import pandas as pd

from typing import List

<h4>Load MWE lists</h4>

In [2]:
parseme_correct_mwe_filepath = os.path.join('PARSEME_1.2_Polish_Dataset', 'PL', 'train_df.tsv')
parseme_incorrect_mwe_filepath = 'parseme_incorrect_mwes.tsv'

kgr10_correct_mwe_filepath = 'correct_mwe.tsv'
kgr10_incorrect_mwe_filepath = 'incorrect_MWE_kompozycyjne_polaczenia_plWN.csv'

In [3]:
parseme_train_correct_df = pd.read_csv(parseme_correct_mwe_filepath, sep='\t')
parseme_train_correct_df = parseme_train_correct_df[parseme_train_correct_df['parseme:mwe'] != '*']
parseme_train_correct_mwe_list = []
parseme_train_correct_mwe_list_lemmatized = []

first_word_id = 0
curr_word_id = 0
curr_sent_id = 0
mwe_words_list = []
mwe_lemmas_list = []

for ind, row in parseme_train_correct_df.iterrows():
    if len(row['parseme:mwe']) > 1:
        if len(mwe_words_list) > 1:
            parseme_train_correct_mwe_list.append(' '.join(mwe_words_list))
            parseme_train_correct_mwe_list_lemmatized.append(' '.join(mwe_lemmas_list))
        
        first_word_id = int(row['id'])
        curr_word_id = int(row['id'])
        curr_sent_id = int(row['sent_id'])

        mwe_words_list = [row['form']]
        mwe_lemmas_list = [row['lemma']]
        
    if len(row['parseme:mwe']) == 1:
        if int(row['sent_id']) == curr_sent_id and int(row['id']) == curr_word_id + 1:
            curr_word_id = int(row['id'])
            curr_sent_id = int(row['sent_id'])
        
            mwe_words_list.append(row['form'])
            mwe_lemmas_list.append(row['lemma'])

parseme_train_incorrect_mwe_df = pd.read_csv(parseme_incorrect_mwe_filepath, sep='\t')
parseme_train_incorrect_mwe_list = [' '.join([str(first_word).lower(), str(second_word).lower()]) for first_word, second_word in zip(parseme_train_incorrect_mwe_df['first_word'].tolist(), parseme_train_incorrect_mwe_df['second_word'].tolist())]

kgr10_correct_mwe_df = pd.read_csv(kgr10_correct_mwe_filepath, sep='\t')
kgr10_correct_mwe_list = [mwe.lower() for mwe in kgr10_correct_mwe_df['Lemma'].tolist()]

kgr10_incorrect_mwe_df = pd.read_csv(kgr10_incorrect_mwe_filepath, sep=',', on_bad_lines='skip')
kgr10_incorrect_mwe_list = [mwe.lower() for mwe in kgr10_incorrect_mwe_df['lemma'].tolist()]

print(f'PARSEME correct MWE: {len(parseme_train_correct_mwe_list)}',
      f'PARSEME incorrect MWE: {len(parseme_train_incorrect_mwe_list)}',
      f'KGR10 correct MWE: {len(kgr10_correct_mwe_list)}',
      f'KGR10 incorrect MWE: {len(kgr10_incorrect_mwe_list)}',
      sep='\n')

PARSEME correct MWE: 3859
PARSEME incorrect MWE: 29027
KGR10 correct MWE: 53978
KGR10 incorrect MWE: 5249


<h4>Compare MWE lists</h4>

In [4]:
# compare without lemmatization
kgr10_corr_parseme_corr_intersect = set(kgr10_correct_mwe_list).intersection(set(parseme_train_correct_mwe_list))
kgr10_corr_parseme_incorr_intersect = set(kgr10_correct_mwe_list).intersection(set(parseme_train_incorrect_mwe_list))

kgr10_incorr_parseme_corr_intersect = set(kgr10_incorrect_mwe_list).intersection(set(parseme_train_correct_mwe_list))
kgr10_incorr_parseme_incorr_intersect = set(kgr10_incorrect_mwe_list).intersection(set(parseme_train_incorrect_mwe_list))

print(f'KGR10 correct intersection PARSEME correct: {len(kgr10_corr_parseme_corr_intersect)}',
      f'KGR10 correct intersection PARSEME incorrect: {len(kgr10_corr_parseme_incorr_intersect)}',
      f'KGR10 incorrect intersection PARSEME correct: {len(kgr10_incorr_parseme_corr_intersect)}',
      f'KGR10 incorrect intersection PARSEME incorrect: {len(kgr10_incorr_parseme_incorr_intersect)}',
      sep='\n')
print(len(set(parseme_train_correct_mwe_list)))

KGR10 correct intersection PARSEME correct: 127
KGR10 correct intersection PARSEME incorrect: 408
KGR10 incorrect intersection PARSEME correct: 5
KGR10 incorrect intersection PARSEME incorrect: 41
2608


In [8]:
# compare without lemmatization with checking if MWE is a part of MWE in another list
# kgr10_corr_parseme_corr_intersect = set(kgr10_correct_mwe_list).intersection(set(parseme_train_correct_mwe_list))
kgr10_corr_parseme_corr_intersect = [parseme_mwe for parseme_mwe in set(parseme_train_correct_mwe_list) if parseme_mwe in set(kgr10_correct_mwe_list) or any(parseme_mwe in kgr10_mwe for kgr10_mwe in kgr10_correct_mwe_list)]
# kgr10_corr_parseme_incorr_intersect = set(kgr10_correct_mwe_list).intersection(set(parseme_train_incorrect_mwe_list))
kgr10_corr_parseme_incorr_intersect = [parseme_mwe for parseme_mwe in set(parseme_train_incorrect_mwe_list) if parseme_mwe in set(kgr10_correct_mwe_list) or any(parseme_mwe in kgr10_mwe for kgr10_mwe in kgr10_correct_mwe_list)]

# kgr10_incorr_parseme_corr_intersect = set(kgr10_incorrect_mwe_list).intersection(set(parseme_train_correct_mwe_list))
kgr10_incorr_parseme_corr_intersect = [parseme_mwe for parseme_mwe in set(parseme_train_correct_mwe_list) if parseme_mwe in set(kgr10_incorrect_mwe_list) or any(parseme_mwe in kgr10_mwe for kgr10_mwe in kgr10_incorrect_mwe_list)]
# kgr10_incorr_parseme_incorr_intersect = set(kgr10_incorrect_mwe_list).intersection(set(parseme_train_incorrect_mwe_list))
kgr10_incorr_parseme_incorr_intersect = [parseme_mwe for parseme_mwe in set(parseme_train_incorrect_mwe_list) if parseme_mwe in set(kgr10_incorrect_mwe_list) or any(parseme_mwe in kgr10_mwe for kgr10_mwe in kgr10_incorrect_mwe_list)]

print(f'KGR10 correct intersection PARSEME correct: {len(kgr10_corr_parseme_corr_intersect)}',
      f'KGR10 correct intersection PARSEME incorrect: {len(kgr10_corr_parseme_incorr_intersect)}',
      f'KGR10 incorrect intersection PARSEME correct: {len(kgr10_incorr_parseme_corr_intersect)}',
      f'KGR10 incorrect intersection PARSEME incorrect: {len(kgr10_incorr_parseme_incorr_intersect)}',
      sep='\n')

KGR10 correct intersection PARSEME correct: 138
KGR10 correct intersection PARSEME incorrect: 778
KGR10 incorrect intersection PARSEME correct: 16
KGR10 incorrect intersection PARSEME incorrect: 153


In [13]:
# save lists to CSV
filenames = ['kgr10_corr_parseme_corr_intersect.csv', 'kgr10_corr_parseme_incorr_intersect.csv', 'kgr10_incorr_parseme_corr_intersect.csv', 'kgr10_incorr_parseme_incorr_intersect.csv']
mwe_lists = [kgr10_corr_parseme_corr_intersect, kgr10_corr_parseme_incorr_intersect, kgr10_incorr_parseme_corr_intersect, kgr10_incorr_parseme_incorr_intersect]

for ind, mwe_list in enumerate(mwe_lists):
    with open(filenames[ind], 'w', encoding='utf-8') as out_file:
        out_file.write('\n'.join(mwe_list))

In [None]:
# check how many MWE longer than 2 words are in the datasets
for dataset in [set(kgr10_correct_mwe_list), set(kgr10_incorrect_mwe_list), set(parseme_train_correct_mwe_list), set(parseme_train_incorrect_mwe_list)]:
    longer_mwe_count = 0
    for mwe in dataset:
        if len(mwe.split(' ')) > 2:
            longer_mwe_count += 1
            
    print(f'MWE longer than 2 words: {longer_mwe_count}')

In [5]:
# init Morfeusz2 lemmatizer
def init_lemmatizer():
    return morfeusz2.Morfeusz()  # initialize Morfeusz object


# lemmatize MWEs
def lemmatize_mwe(mwe_list, lemmatizer) -> List[str]:
    lemmatized_mwe_list = ['*' * 200 for _ in range(len(mwe_list))]

    for i, mwe in enumerate(mwe_list):
        mwe_words = [token for token in mwe.split(' ')]
        lemmatized_mwe_list[i] = ' '.join(
            [str(lemmatizer.analyse(word)[0][2][1]) if word not in string.punctuation else word for word in mwe_words])

    return lemmatized_mwe_list

In [6]:
# compare with lemmatization
lemmatizer = init_lemmatizer()

parseme_train_incorrect_mwe_list_lemmatized = lemmatize_mwe(parseme_train_incorrect_mwe_list, lemmatizer)
kgr10_correct_mwe_list_lemmatized = lemmatize_mwe(kgr10_correct_mwe_list, lemmatizer)
kgr10_incorrect_mwe_list_lemmatized = lemmatize_mwe(kgr10_incorrect_mwe_list, lemmatizer)

kgr10_corr_parseme_corr_intersect = set(kgr10_correct_mwe_list_lemmatized).intersection(set(parseme_train_correct_mwe_list_lemmatized))
kgr10_corr_parseme_incorr_intersect = set(kgr10_correct_mwe_list_lemmatized).intersection(set(parseme_train_incorrect_mwe_list_lemmatized))

kgr10_incorr_parseme_corr_intersect = set(kgr10_incorrect_mwe_list_lemmatized).intersection(set(parseme_train_correct_mwe_list_lemmatized))
kgr10_incorr_parseme_incorr_intersect = set(kgr10_incorrect_mwe_list_lemmatized).intersection(set(parseme_train_incorrect_mwe_list_lemmatized))

print(f'KGR10 correct intersection PARSEME correct: {len(kgr10_corr_parseme_corr_intersect)}',
      f'KGR10 correct intersection PARSEME incorrect: {len(kgr10_corr_parseme_incorr_intersect)}',
      f'KGR10 incorrect intersection PARSEME correct: {len(kgr10_incorr_parseme_corr_intersect)}',
      f'KGR10 incorrect intersection PARSEME incorrect: {len(kgr10_incorr_parseme_incorr_intersect)}',
      sep='\n')

KGR10 correct intersection PARSEME correct: 439
KGR10 correct intersection PARSEME incorrect: 1238
KGR10 incorrect intersection PARSEME correct: 31
KGR10 incorrect intersection PARSEME incorrect: 137


In [5]:
# compare MWE lists of PARSEME and get number of sentences with MWEs shared between correct and incorrect MWEs
df_corr = pd.read_csv('parseme_correct_mwes.tsv', sep='\t')
df_incorr = pd.read_csv('parseme_incorrect_mwes.tsv', sep='\t')
mwe_lemma_intersection = set(df_corr['full_mwe_lemma'].tolist()).intersection(set(df_incorr['full_mwe_lemma']))

print(f"No. of correct MWE: {len(df_corr['full_mwe'].unique().tolist())}",
      f"No. of incorrect MWE: {len(df_incorr['full_mwe'].unique().tolist())}",
      f"No. of total MWE: {len(df_corr['full_mwe'].unique().tolist()) + len(df_incorr['full_mwe'].unique().tolist())}",
      sep='\n')

print(f'MWE lemma intersection: {len(mwe_lemma_intersection)}',
      f'No. of sentences in corr dataset with shared MWE: {len(df_corr[df_corr["full_mwe_lemma"].isin(list(mwe_lemma_intersection))])}',
      f'No. of sentences in incorr dataset with shared MWE: {len(df_incorr[df_incorr["full_mwe_lemma"].isin(list(mwe_lemma_intersection))])}',
      f'No. of sentences in total: {len(df_corr) + len(df_incorr)}',
      sep='\n')

print(f'No. of sentences with zdecydować się in corr: {len(df_corr[df_corr["full_mwe_lemma"] == "zdecydować się"])}',
      f'No. of sentences with zdecydować się in incorr: {len(df_incorr[df_incorr["full_mwe_lemma"] == "zdecydować się"])}',
      sep='\n')

# df_corr[df_corr["full_mwe_lemma"] == "nie mieć"].to_csv('parseme_sentences_with_nie_miec.tsv', sep='\t', index=False)

df_intersection = df_corr[df_corr["full_mwe_lemma"].isin(list(mwe_lemma_intersection))].append(df_incorr[df_incorr["full_mwe_lemma"].isin(list(mwe_lemma_intersection))])
df_intersection.to_csv('parseme_correct_incorrect_intersection.tsv', sep='\t', index=False)

No. of correct MWE: 2331
No. of incorrect MWE: 25648
No. of total MWE: 27979
MWE lemma intersection: 0
No. of sentences in corr dataset with shared MWE: 0
No. of sentences in incorr dataset with shared MWE: 0
No. of sentences in total: 32407
No. of sentences with zdecydować się in corr: 26
No. of sentences with zdecydować się in incorr: 0


In [60]:
# get correct MWE occurrences, where they are not tagged as correct
correct_mwe_lemma_list = pd.read_csv('parseme_correct_mwes.tsv', sep='\t')['full_mwe_lemma'].tolist()
df_train = pd.read_csv('PARSEME_1.2_Polish_Dataset/PL/train_df.tsv', sep='\t')

correct_pos = ['VERB+PRON', 'VERB+NOUN', 'ADJ+NOUN', 'VERB+ADP', 'ADJ+PRON', 
               'NOUN+VERB', 'NOUN+NOUN', 'NOUN+ADJ', 'ADP+NOUN', 'NOUN+PRON']

mwe_count = 0
mwe_count_dict = {}
final_df = pd.DataFrame(columns=['type', 'first_word', 'first_word_lemma', 'first_word_id',
                                      'second_word', 'second_word_lemma', 'second_word_id',
                                      'full_mwe', 'full_mwe_lemma', 'sentence'])
longer_mwe_list = []
for sent_id in df_train['sent_id'].unique().tolist():
    pos_list = df_train[df_train['sent_id'] == sent_id]['upos'].tolist()
    idx_list = df_train[df_train['sent_id'] == sent_id]['id'].tolist()
    form_list = df_train[df_train['sent_id'] == sent_id]['form'].tolist()
    lemma_list = df_train[df_train['sent_id'] == sent_id]['lemma'].tolist()
    mwe_tag_list = df_train[df_train['sent_id'] == sent_id]['parseme:mwe'].tolist()
    deprel_list = df_train[df_train['sent_id'] == sent_id]['deprel'].tolist()
    deps_list = [ast.literal_eval(elem) if type(elem) == str else [] for elem in df_train[df_train['sent_id'] == sent_id]['deps'].tolist()]
    sentence = ' '.join([str(word) for word in form_list])
        
    mwe_part_ind = [False for _ in range(len(lemma_list))]
    mwe = ''
    mwe_pos = ''
    for ind, mwe_tag in enumerate(mwe_tag_list[:-1]):
        if mwe_tag == '*':
            if mwe != '' and len(mwe.split(' ')) > 2:
                longer_mwe_list.append((mwe, sentence))
            mwe = ''
            mwe_pos = ''
            
        else:
            if mwe == '':
                mwe = form_list[ind]
                mwe_pos = pos_list[ind]
            else:
                mwe_pos += f'+{pos_list[ind]}'
                mwe += f' {form_list[ind]}'
                
with open('parseme_mwes_longer_than_2.tsv', 'w') as out_file:
    for mwe_tuple in longer_mwe_list:
        out_file.write(f'{mwe_tuple[0]}\t{mwe_tuple[1]}\n')

In [9]:
# search for piece of sentence across all sentence that may contain it in the train set

for sent_id in df_train['sent_id'].unique().tolist():
    pos_list = df_train[df_train['sent_id'] == sent_id]['upos'].tolist()
    idx_list = df_train[df_train['sent_id'] == sent_id]['id'].tolist()
    form_list = df_train[df_train['sent_id'] == sent_id]['form'].tolist()
    lemma_list = df_train[df_train['sent_id'] == sent_id]['lemma'].tolist()
    mwe_tag_list = df_train[df_train['sent_id'] == sent_id]['parseme:mwe'].tolist()
    deprel_list = df_train[df_train['sent_id'] == sent_id]['deprel'].tolist()
    deps_list = [ast.literal_eval(elem) if type(elem) == str else [] for elem in df_train[df_train['sent_id'] == sent_id]['deps'].tolist()]
    sentence = ' '.join([str(word) for word in form_list])
    if 'ankiety przeprowadzonej' in sentence:
        print(f'{[[word, mwe_tag] for word, mwe_tag in zip(form_list, mwe_tag_list)]}')

[['Takie', '*'], ['są', '*'], ['najświeższe', '*'], ['wyniki', '*'], ['ankiety', '1:LVC.full'], ['przeprowadzonej', '1'], ['wśród', '*'], ['bankowców', '*'], ['przez', '*'], ['Pentor', '*'], ['na', '*'], ['zlecenie', '*'], ['Związku', '*'], ['Banków', '*'], ['Polskich', '*'], ['.', '*']]


In [10]:
# count sentences containing untagged correct MWEs
df_train = pd.read_csv('PARSEME_1.2_Polish_Dataset/PL/train_df.tsv', sep='\t')

correct_mwe_lemma_list = pd.read_csv('parseme_correct_mwes.tsv', sep='\t')['full_mwe_lemma'].tolist()

untagged_corr_mwe_df = pd.DataFrame(columns=['type', 'first_word', 'first_word_lemma', 'first_word_id',
                                             'second_word', 'second_word_lemma', 'second_word_id',
                                             'full_mwe', 'full_mwe_lemma', 'sentence'])

for sent_id in df_train['sent_id'].unique().tolist():
    pos_list = df_train[df_train['sent_id'] == sent_id]['upos'].tolist()
    idx_list = df_train[df_train['sent_id'] == sent_id]['id'].tolist()
    form_list = df_train[df_train['sent_id'] == sent_id]['form'].tolist()
    lemma_list = df_train[df_train['sent_id'] == sent_id]['lemma'].tolist()
    mwe_tag_list = df_train[df_train['sent_id'] == sent_id]['parseme:mwe'].tolist()
    deprel_list = df_train[df_train['sent_id'] == sent_id]['deprel'].tolist()
    deps_list = [ast.literal_eval(elem) if type(elem) == str else [] for elem in df_train[df_train['sent_id'] == sent_id]['deps'].tolist()]
    sentence = ' '.join([str(word) for word in form_list])
    
    for lemma_ind, lemma in enumerate(lemma_list[:-1]):
        first_pos = pos_list[lemma_ind]
        second_pos = pos_list[lemma_ind + 1]
        mwe_pos = f'{first_pos}+{second_pos}'
        
        mwe_lemma = f'{str(lemma_list[lemma_ind])} {str(lemma_list[lemma_ind + 1])}'
        
        if (mwe_lemma in correct_mwe_lemma_list and 
            mwe_tag_list[lemma_ind] == '*' and 
            mwe_tag_list[lemma_ind + 1 ] == '*'):
            # append MWE to DataFrame
            untagged_corr_mwe_df = untagged_corr_mwe_df.append({'type': mwe_pos, 
                                                                'first_word': form_list[lemma_ind], 
                                                                'first_word_lemma': lemma_list[lemma_ind],
                                                                'first_word_id': lemma_ind,
                                                                'second_word': form_list[lemma_ind + 1],
                                                                'second_word_lemma': lemma_list[lemma_ind + 1],
                                                                'second_word_id': int(lemma_ind) + 1,
                                                                'full_mwe': str(form_list[lemma_ind]) + ' ' + str(form_list[lemma_ind + 1]),
                                                                'full_mwe_lemma': str(lemma_list[lemma_ind]) + ' ' + str(lemma_list[lemma_ind + 1]),
                                                                'sentence': sentence}, 
                                                               ignore_index=True)
        

print(f'No. of sentences containing untagged correct MWEs: {len(untagged_corr_mwe_df)}',
      f'No. of unique MWEs: {len(untagged_corr_mwe_df["full_mwe"].unique().tolist())}',
      sep='\n')
untagged_corr_mwe_df.to_csv('parseme_untagged_correct_mwe.tsv', sep='\t')

No. of sentences containing untagged correct MWEs: 183
No. of unique MWEs: 131


In [37]:
# count tokens related to correct MWEs and total number of tokens
df_train = pd.read_csv('PARSEME_1.2_Polish_Dataset/PL/train_df.tsv', sep='\t')
print(f'Total tokens: {len(df_train)}',
      f'Tokens tagged as correct: {len(df_train[df_train["parseme:mwe"] != "*"])}',
      sep='\n')

Total tokens: 298437
Tokens tagged as correct: 11379
