In [22]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from collections import defaultdict
import pandas as pd
import numpy as np
from tqdm import tqdm
import os

In [29]:
# Only the tokenizer is loaded here; the model load is intentionally left disabled.
#mt5_model = AutoModelForSeq2SeqLM.from_pretrained("google/mt5-small")
mt5_tokenizer = AutoTokenizer.from_pretrained("google/mt5-small")

# Keep just the token strings of the vocabulary for fast membership / intersection tests.
# NOTE(review): SentencePiece-style vocab entries may carry a "▁" word-boundary prefix —
# confirm whether tokens should be normalised before comparing them to raw morphemes.
mt5_vocab_set = set(mt5_tokenizer.get_vocab().keys())



In [34]:
# Parse MorphyNet into a dict of lang -> {"derivational" / "inflectional" -> set of morphemes}.
# The data root defaults to the original cluster location and can be overridden through the
# MORPHYNET_PATH environment variable.
MORPHYNET_PATH = os.environ.get("MORPHYNET_PATH", "/mmfs1/gscratch/ark/knylund/MorphyNet")

_DER_COLUMNS = ["source_word", "target_word", "source_POS",
                "target_POS", "morpheme", "morpheme_type"]
_INF_COLUMNS = ["lemma", "inflected_word",
                "morpheme_features", "morpheme_segmentation"]


def _read_morphynet_tsv(path, columns):
    """Read a headerless, tab-separated MorphyNet file with every field kept as a string.

    pandas' default NA parsing would turn literal fields such as "null", "nan" or "NA"
    into float NaN, which corrupts both the morpheme sets and the lemma comparison.
    Here all fields are read as plain text and any remaining missing value (e.g. a
    short row) is normalised to the empty string.

    Args:
        path: Path of the TSV file to read.
        columns: Column names to assign, in file order.

    Returns:
        A DataFrame whose cells are all strings ("" where a field was absent).
    """
    table = pd.read_csv(path, sep="\t", names=columns, dtype=str, keep_default_na=False)
    return table.fillna("")


def _derivational_morphemes(path):
    """Return the set of distinct, non-empty values of the "morpheme" column."""
    table = _read_morphynet_tsv(path, _DER_COLUMNS)
    return set(table["morpheme"]) - {""}


def _inflectional_morphemes(path):
    """Return every "|"-separated segment that is not its own row's lemma.

    Equivalent to splitting each row's segmentation, discarding the lemma, and taking
    the union over all rows, but done with vectorised pandas operations instead of a
    Python-level row-by-row apply. Empty segments are dropped.
    """
    table = _read_morphynet_tsv(path, _INF_COLUMNS)
    # One row per (lemma, segment) pair; explode keeps each segment aligned with its lemma.
    exploded = (
        table[["lemma"]]
        .assign(piece=table["morpheme_segmentation"].str.split("|"))
        .explode("piece")
    )
    keep = (exploded["piece"] != exploded["lemma"]) & (exploded["piece"] != "")
    return set(exploded.loc[keep, "piece"])


lang_to_morphemes = defaultdict(dict)
for lang in tqdm(os.listdir(MORPHYNET_PATH)):
    lang_dir = os.path.join(MORPHYNET_PATH, lang)
    der_path = os.path.join(lang_dir, f"{lang}.derivational.v1.tsv")
    inf_path = os.path.join(lang_dir, f"{lang}.inflectional.v1.tsv")
    # Directory entries that have neither file never touch the defaultdict,
    # so they do not show up as languages in the result.
    if os.path.exists(der_path):
        lang_to_morphemes[lang]["derivational"] = _derivational_morphemes(der_path)
    if os.path.exists(inf_path):
        lang_to_morphemes[lang]["inflectional"] = _inflectional_morphemes(inf_path)

# np.save appends ".npy" and pickles this Python object; reload it with
# np.load("./lang_to_morpheme_sets.npy", allow_pickle=True).item()
np.save("./lang_to_morpheme_sets", lang_to_morphemes)

100%|██████████| 17/17 [00:39<00:00,  2.32s/it]


In [35]:
# For every language, report how many of its MorphyNet morphemes appear verbatim
# in the mT5 vocabulary, as "<hits>/<total>", separately per morpheme category.
for lang, inf_der_morphemes in lang_to_morphemes.items():
    print(f"Language: {lang}")
    for category, label in (("inflectional", "Inflectional"), ("derivational", "Derivational")):
        # A language may have only one of the two MorphyNet files.
        if category not in inf_der_morphemes:
            continue
        morphemes = inf_der_morphemes[category]
        hits = mt5_vocab_set & morphemes
        print(f"{label} morphemes in vocab: {len(hits)}/{len(morphemes)}")

Language: deu
Inflectional morphemes in vocab: 763/13365
Derivational morphemes in vocab: 321/499
Language: pol
Derivational morphemes in vocab: 293/440
Language: hun
Derivational morphemes in vocab: 424/658
Language: hbs
Derivational morphemes in vocab: 261/356
Language: swe
Inflectional morphemes in vocab: 36/716
Derivational morphemes in vocab: 236/321
Language: spa
Derivational morphemes in vocab: 404/685
Language: cat
Inflectional morphemes in vocab: 43/61
Derivational morphemes in vocab: 183/237
Language: por
Derivational morphemes in vocab: 274/376
Language: ces
Inflectional morphemes in vocab: 82/134
Derivational morphemes in vocab: 261/349
Language: ita
Inflectional morphemes in vocab: 66/104
Derivational morphemes in vocab: 436/913
Language: fin
Inflectional morphemes in vocab: 259/7312
Derivational morphemes in vocab: 228/425
Language: eng
Inflectional morphemes in vocab: 8/8
Derivational morphemes in vocab: 853/2254
Language: mon
Inflectional morphemes in vocab: 70/93
Deriv