In [9]:
import os
import string

import morfeusz2
import pandas as pd

from typing import List

<h4>Load MWE lists</h4>

In [2]:
# Input files, given as paths relative to the notebook's working directory.
# PARSEME lists:
parseme_incorrect_mwe_filepath = 'parseme_incorrect_mwes.tsv'
parseme_correct_mwe_filepath = os.path.join(
    os.path.join('PARSEME_1.2_Polish_Dataset', 'PL'),
    'train_df.tsv',
)

# KGR10 lists:
kgr10_incorrect_mwe_filepath = 'incorrect_MWE_kompozycyjne_polaczenia_plWN.csv'
kgr10_correct_mwe_filepath = 'correct_mwe.tsv'

In [3]:
# Collect the contiguous multi-word expressions (surface forms and lemmas)
# from the PARSEME training table.
parseme_train_correct_df = pd.read_csv(parseme_correct_mwe_filepath, sep='\t')
# Drop rows whose 'parseme:mwe' value is '*'
# (presumably tokens outside any MWE, per the PARSEME .cupt convention -- TODO confirm).
parseme_train_correct_df = parseme_train_correct_df[parseme_train_correct_df['parseme:mwe'] != '*']
parseme_train_correct_mwe_list = []
parseme_train_correct_mwe_list_lemmatized = []
# NOTE(review): the forms collected here are NOT lower-cased, while the other
# three MWE lists in this notebook are -- confirm whether that is intended.

first_word_id = 0
curr_word_id = 0
curr_sent_id = 0
mwe_words_list = []
mwe_lemmas_list = []

for ind, row in parseme_train_correct_df.iterrows():
    # Any annotation longer than one character is treated as the first token of
    # a new MWE (presumably the 'id:category' form -- TODO confirm). Note that a
    # bare two-character value would also be treated as a start.
    if len(row['parseme:mwe']) > 1:
        # A new MWE begins: store the one collected so far, but only if it
        # actually spans more than one token.
        if len(mwe_words_list) > 1:
            parseme_train_correct_mwe_list.append(' '.join(mwe_words_list))
            parseme_train_correct_mwe_list_lemmatized.append(' '.join(mwe_lemmas_list))

        first_word_id = int(row['id'])
        curr_word_id = int(row['id'])
        curr_sent_id = int(row['sent_id'])

        mwe_words_list = [row['form']]
        mwe_lemmas_list = [row['lemma']]

    # A single-character annotation continues the current MWE.
    if len(row['parseme:mwe']) == 1:
        # Only tokens directly following the previous one in the same sentence
        # are appended; non-adjacent continuation tokens are skipped.
        if int(row['sent_id']) == curr_sent_id and int(row['id']) == curr_word_id + 1:
            curr_word_id = int(row['id'])
            curr_sent_id = int(row['sent_id'])

            mwe_words_list.append(row['form'])
            mwe_lemmas_list.append(row['lemma'])

# Fix: the loop only stores a MWE when the *next* one starts, so the last MWE
# in the table has to be flushed explicitly once iteration is finished.
if len(mwe_words_list) > 1:
    parseme_train_correct_mwe_list.append(' '.join(mwe_words_list))
    parseme_train_correct_mwe_list_lemmatized.append(' '.join(mwe_lemmas_list))

# Build one lower-cased "first second" string per row of the incorrect-MWE table.
parseme_train_incorrect_mwe_df = pd.read_csv(parseme_incorrect_mwe_filepath, sep='\t')
parseme_train_incorrect_mwe_list = [
    f'{str(first).lower()} {str(second).lower()}'
    for first, second in zip(
        parseme_train_incorrect_mwe_df['first_word'].tolist(),
        parseme_train_incorrect_mwe_df['second_word'].tolist(),
    )
]

# KGR10 correct MWEs: tab-separated file, lower-cased values of the 'Lemma' column.
kgr10_correct_mwe_df = pd.read_csv(kgr10_correct_mwe_filepath, sep='\t')
kgr10_correct_mwe_list = [
    lemma.lower()
    for lemma in kgr10_correct_mwe_df['Lemma'].tolist()
]

# KGR10 incorrect MWEs: comma-separated file, malformed lines are skipped on read;
# lower-cased values of the 'lemma' column.
kgr10_incorrect_mwe_df = pd.read_csv(kgr10_incorrect_mwe_filepath, on_bad_lines='skip', sep=',')
kgr10_incorrect_mwe_list = [
    lemma.lower()
    for lemma in kgr10_incorrect_mwe_df['lemma'].tolist()
]

# Report the size of each of the four MWE lists, one per line.
print('\n'.join([
    f'PARSEME correct MWE: {len(parseme_train_correct_mwe_list)}',
    f'PARSEME incorrect MWE: {len(parseme_train_incorrect_mwe_list)}',
    f'KGR10 correct MWE: {len(kgr10_correct_mwe_list)}',
    f'KGR10 incorrect MWE: {len(kgr10_incorrect_mwe_list)}',
]))

PARSEME correct MWE: 3859
PARSEME incorrect MWE: 29027
KGR10 correct MWE: 53978
KGR10 incorrect MWE: 5249


<h4>Compare MWE lists</h4>

In [4]:
# Compare the raw (non-lemmatized) lists: every KGR10 list is intersected with
# every PARSEME list, treating each list as a set of unique strings.
kgr10_corr_parseme_corr_intersect = set(kgr10_correct_mwe_list) & set(parseme_train_correct_mwe_list)
kgr10_corr_parseme_incorr_intersect = set(kgr10_correct_mwe_list) & set(parseme_train_incorrect_mwe_list)

kgr10_incorr_parseme_corr_intersect = set(kgr10_incorrect_mwe_list) & set(parseme_train_correct_mwe_list)
kgr10_incorr_parseme_incorr_intersect = set(kgr10_incorrect_mwe_list) & set(parseme_train_incorrect_mwe_list)

# One line per pair of lists with the number of shared entries.
print('\n'.join([
    f'KGR10 correct intersection PARSEME correct: {len(kgr10_corr_parseme_corr_intersect)}',
    f'KGR10 correct intersection PARSEME incorrect: {len(kgr10_corr_parseme_incorr_intersect)}',
    f'KGR10 incorrect intersection PARSEME correct: {len(kgr10_incorr_parseme_corr_intersect)}',
    f'KGR10 incorrect intersection PARSEME incorrect: {len(kgr10_incorr_parseme_incorr_intersect)}',
]))

KGR10 correct intersection PARSEME correct: 127
KGR10 correct intersection PARSEME incorrect: 408
KGR10 incorrect intersection PARSEME correct: 5
KGR10 incorrect intersection PARSEME incorrect: 41


In [7]:
# init Morfeusz2 lemmatizer
def init_lemmatizer():
    """Create and return a new ``morfeusz2.Morfeusz`` object."""
    morfeusz_analyser = morfeusz2.Morfeusz()
    return morfeusz_analyser


# lemmatize MWEs
def lemmatize_mwe(mwe_list, lemmatizer) -> List[str]:
    """Lemmatize every expression in ``mwe_list`` token by token.

    Each expression is split on single spaces. Tokens consisting solely of
    ASCII punctuation characters (``string.punctuation``) are kept unchanged;
    every other token is replaced by element ``[2][1]`` of the first
    interpretation returned by ``lemmatizer.analyse(token)``. Tokens for which
    the analyser returns no interpretation are kept unchanged instead of
    raising ``IndexError``.

    Args:
        mwe_list: iterable of space-separated expressions (strings).
        lemmatizer: object exposing ``analyse(token)`` that returns a sequence
            of interpretations indexable as ``interp[2][1]``.

    Returns:
        A list with one space-joined, lemmatized string per input expression,
        in the input order.
    """
    punctuation_chars = set(string.punctuation)

    def _lemmatize_token(token: str) -> str:
        # Per-character test: the previous substring test (`token in
        # string.punctuation`) accepted '()' but not '...' or '.,'.
        # An empty token (from consecutive spaces) is also returned as is.
        if all(char in punctuation_chars for char in token):
            return token

        interpretations = lemmatizer.analyse(token)
        if not interpretations:
            # Nothing to pick a lemma from; keep the surface form.
            return token

        # NOTE(review): Morfeusz lemmas may carry a homonym suffix such as
        # ':s1' -- confirm whether it should be stripped before comparing
        # against lemmas from other sources.
        return str(interpretations[0][2][1])

    return [
        ' '.join(_lemmatize_token(token) for token in mwe.split(' '))
        for mwe in mwe_list
    ]

In [10]:
# Compare after lemmatization: the three lower-cased lists are lemmatized with
# Morfeusz2, while the PARSEME correct list uses the lemmas collected while loading.
lemmatizer = init_lemmatizer()

parseme_train_incorrect_mwe_list_lemmatized = lemmatize_mwe(parseme_train_incorrect_mwe_list, lemmatizer)
kgr10_correct_mwe_list_lemmatized = lemmatize_mwe(kgr10_correct_mwe_list, lemmatizer)
kgr10_incorrect_mwe_list_lemmatized = lemmatize_mwe(kgr10_incorrect_mwe_list, lemmatizer)

# These assignments overwrite the intersection variables of the non-lemmatized comparison.
kgr10_corr_parseme_corr_intersect = (
    set(kgr10_correct_mwe_list_lemmatized) & set(parseme_train_correct_mwe_list_lemmatized)
)
kgr10_corr_parseme_incorr_intersect = (
    set(kgr10_correct_mwe_list_lemmatized) & set(parseme_train_incorrect_mwe_list_lemmatized)
)

kgr10_incorr_parseme_corr_intersect = (
    set(kgr10_incorrect_mwe_list_lemmatized) & set(parseme_train_correct_mwe_list_lemmatized)
)
kgr10_incorr_parseme_incorr_intersect = (
    set(kgr10_incorrect_mwe_list_lemmatized) & set(parseme_train_incorrect_mwe_list_lemmatized)
)

# One line per pair of lists with the number of shared entries.
print('\n'.join([
    f'KGR10 correct intersection PARSEME correct: {len(kgr10_corr_parseme_corr_intersect)}',
    f'KGR10 correct intersection PARSEME incorrect: {len(kgr10_corr_parseme_incorr_intersect)}',
    f'KGR10 incorrect intersection PARSEME correct: {len(kgr10_incorr_parseme_corr_intersect)}',
    f'KGR10 incorrect intersection PARSEME incorrect: {len(kgr10_incorr_parseme_incorr_intersect)}',
]))

KGR10 correct intersection PARSEME correct: 439
KGR10 correct intersection PARSEME incorrect: 1238
KGR10 incorrect intersection PARSEME correct: 31
KGR10 incorrect intersection PARSEME incorrect: 137
