In [22]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from collections import defaultdict
import pandas as pd
import numpy as np
from tqdm import tqdm
import os

In [48]:
t5_tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-small")
mt5_tokenizer = AutoTokenizer.from_pretrained("google/mt5-small")
gemma_tokenizer = tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b")
#mt5_model = AutoModelForSeq2SeqLM.from_pretrained("google/mt5-small")
mt5_vocab_set = set(mt5_tokenizer.get_vocab())
t5_vocab_set = set(t5_tokenizer.get_vocab())
gemma_vocab_set = set(gemma_tokenizer.get_vocab())

In [42]:
# Parse morphynet into dict of lang -> all morphemes:
# Current handling of inflectional morphemes is a bit messed up for deu
# e.g. its' hard to remove babysitt from its segmentation ge-|babysitt|-t because the lemma is babysitten
MORPHYNET_PATH = "/mmfs1/gscratch/ark/knylund/MorphyNet"
lang_to_morphemes = defaultdict(dict)
for lang in tqdm(os.listdir(MORPHYNET_PATH)):
    der_path = f"{MORPHYNET_PATH}/{lang}/{lang}.derivational.v1.tsv"
    inf_path1 = f"{MORPHYNET_PATH}/{lang}/{lang}.inflectional.v1.tsv"
    inf_path2 = f"{MORPHYNET_PATH}/{lang}/{lang}.inflectional.segmentation.v1.tsv"
    if os.path.exists(der_path):
        lang_der_morphynet = pd.read_csv(der_path, sep="\t", names=["source_word", "target_word", "source_POS",
                                                                    "target_POS", "morpheme", "morpheme_type"])
        lang_der_morphemes = set(lang_der_morphynet["morpheme"])
        lang_to_morphemes[lang]["derivational"] = lang_der_morphemes
    inf_path = None
    if os.path.exists(inf_path1):
        inf_path = inf_path1
    elif os.path.exists(inf_path2):
        inf_path = inf_path2
    if inf_path:
        lang_inf_morphynet = pd.read_csv(inf_path, sep="\t", names=["lemma", "inflected_word",
                                                                    "morpheme_features", "morpheme_segmentation"])
        lang_inf_morphemes = set()
        def update_inf_morphemes(row):
            new_morphemes = str(row["morpheme_segmentation"]).split("|")[1:]
            # special segmentation handling since deu has a different format
            #if lang == "deu":
            #    cur_morphemes = set()
            #    for m in new_morphemes:
            #        if "-" in m:
            #            cur_morphemes.add(m.split(" ")[0].replace("-", ""))
            #    new_morphemes = cur_morphemes
            #else:
            new_morphemes = set(m.split(" ")[0].replace("-", "") for m in new_morphemes)
            #new_morphemes.discard(row["lemma"])
            lang_inf_morphemes.update(new_morphemes)
        lang_inf_morphynet.apply(update_inf_morphemes, axis=1)
        lang_to_morphemes[lang]["inflectional"] = lang_inf_morphemes

np.save("./lang_to_morpheme_sets", lang_to_morphemes)

100%|██████████| 17/17 [00:30<00:00,  1.81s/it]


In [52]:
print(f"mT5 vocab size: {len(mt5_vocab_set)}")
for lang, inf_der_morphemes in lang_to_morphemes.items():
    print(f"Language: {lang}")
    if "inflectional" in inf_der_morphemes:
        inf_morphemes = inf_der_morphemes["inflectional"]
        print(f"Inflectional morphemes in vocab: {len(mt5_vocab_set.intersection(inf_morphemes))}/{len(inf_morphemes)}")
    if "derivational" in inf_der_morphemes:
        der_morphemes = inf_der_morphemes["derivational"]
        print(f"Derivational morphemes in vocab: {len(mt5_vocab_set.intersection(der_morphemes))}/{len(der_morphemes)}")

mT5 vocab size: 250100
Language: deu
Inflectional morphemes in vocab: 677/2050
Derivational morphemes in vocab: 321/499
Language: pol
Derivational morphemes in vocab: 293/440
Language: hun
Derivational morphemes in vocab: 424/658
Language: hbs
Derivational morphemes in vocab: 261/356
Language: swe
Inflectional morphemes in vocab: 25/29
Derivational morphemes in vocab: 236/321
Language: spa
Derivational morphemes in vocab: 404/685
Language: cat
Inflectional morphemes in vocab: 41/58
Derivational morphemes in vocab: 183/237
Language: por
Derivational morphemes in vocab: 274/376
Language: ces
Inflectional morphemes in vocab: 80/92
Derivational morphemes in vocab: 261/349
Language: ita
Inflectional morphemes in vocab: 65/100
Derivational morphemes in vocab: 436/913
Language: fin
Inflectional morphemes in vocab: 81/99
Derivational morphemes in vocab: 228/425
Language: eng
Inflectional morphemes in vocab: 6/7
Derivational morphemes in vocab: 853/2254
Language: mon
Inflectional morphemes in v

In [51]:
print(f"gemma vocab size: {len(gemma_vocab_set)}")
for lang, inf_der_morphemes in lang_to_morphemes.items():
    print(f"Language: {lang}")
    if "inflectional" in inf_der_morphemes:
        inf_morphemes = inf_der_morphemes["inflectional"]
        print(f"Inflectional morphemes in vocab: {len(gemma_vocab_set.intersection(inf_morphemes))}/{len(inf_morphemes)}")
    if "derivational" in inf_der_morphemes:
        der_morphemes = inf_der_morphemes["derivational"]
        print(f"Derivational morphemes in vocab: {len(gemma_vocab_set.intersection(der_morphemes))}/{len(der_morphemes)}")

gemma vocab size: 256000
Language: deu
Inflectional morphemes in vocab: 573/2050
Derivational morphemes in vocab: 337/499
Language: pol
Derivational morphemes in vocab: 275/440
Language: hun
Derivational morphemes in vocab: 359/658
Language: hbs
Derivational morphemes in vocab: 202/356
Language: swe
Inflectional morphemes in vocab: 28/29
Derivational morphemes in vocab: 223/321
Language: spa
Derivational morphemes in vocab: 440/685
Language: cat
Inflectional morphemes in vocab: 45/58
Derivational morphemes in vocab: 192/237
Language: por
Derivational morphemes in vocab: 289/376
Language: ces
Inflectional morphemes in vocab: 80/92
Derivational morphemes in vocab: 228/349
Language: ita
Inflectional morphemes in vocab: 69/100
Derivational morphemes in vocab: 444/913
Language: fin
Inflectional morphemes in vocab: 79/99
Derivational morphemes in vocab: 205/425
Language: eng
Inflectional morphemes in vocab: 6/7
Derivational morphemes in vocab: 967/2254
Language: mon
Inflectional morphemes in

In [50]:
print(f"T5 vocab size: {len(t5_vocab_set)}")
for lang, inf_der_morphemes in lang_to_morphemes.items():
    print(f"Language: {lang}")
    if "inflectional" in inf_der_morphemes:
        inf_morphemes = inf_der_morphemes["inflectional"]
        print(f"Inflectional morphemes in vocab: {len(t5_vocab_set.intersection(inf_morphemes))}/{len(inf_morphemes)}")
    if "derivational" in inf_der_morphemes:
        der_morphemes = inf_der_morphemes["derivational"]
        print(f"Derivational morphemes in vocab: {len(t5_vocab_set.intersection(der_morphemes))}/{len(der_morphemes)}")

T5 vocab size: 32100
Language: deu
Inflectional morphemes in vocab: 296/2050
Derivational morphemes in vocab: 186/499
Language: pol
Derivational morphemes in vocab: 90/440
Language: hun
Derivational morphemes in vocab: 117/658
Language: hbs
Derivational morphemes in vocab: 76/356
Language: swe
Inflectional morphemes in vocab: 19/29
Derivational morphemes in vocab: 105/321
Language: spa
Derivational morphemes in vocab: 115/685
Language: cat
Inflectional morphemes in vocab: 18/58
Derivational morphemes in vocab: 76/237
Language: por
Derivational morphemes in vocab: 80/376
Language: ces
Inflectional morphemes in vocab: 24/92
Derivational morphemes in vocab: 74/349
Language: ita
Inflectional morphemes in vocab: 26/100
Derivational morphemes in vocab: 109/913
Language: fin
Inflectional morphemes in vocab: 24/99
Derivational morphemes in vocab: 53/425
Language: eng
Inflectional morphemes in vocab: 5/7
Derivational morphemes in vocab: 425/2254
Language: mon
Inflectional morphemes in vocab: 11