In [1]:
import sys
import os
from IPython import get_ipython

sys.path.append(os.path.abspath("./lib"))
if 'autoreload' not in get_ipython().magics_manager.magics['line']:
    %load_ext autoreload
%autoreload 2

from lib import dataloading as dl
from lib import tokenizer as tk
import torch
import tokenizers
import warnings
import numpy as np
import pandas as pd
from matplotlib_venn import venn2, venn3  
from matplotlib import pyplot as plt 
import plotly.express as px
from collections import defaultdict

In [2]:
data_df = dl.load_conllu(
    r"D:\Dropbox\Bachlorarbeit\Datasets\Universal Dependencies 2.15\ud-treebanks-v2.15\UD_English-GUM\en_gum-ud-train.conllu"
)
data_df = dl.clear_non_UPOS_tags(data_df)
print(data_df.head())

Dropped 2810 rows with non-UPOS tags 
Tags dropped: ['_']
            FORM         LEMMA   UPOS XPOS        FEATS HEAD DEPREL  \
ID                                                                    
1      Aesthetic     aesthetic    ADJ   JJ   Degree=Pos    2   amod   
2   Appreciation  appreciation   NOUN   NN  Number=Sing    0   root   
3            and           and  CCONJ   CC            _    5     cc   
4        Spanish       Spanish    ADJ   JJ   Degree=Pos    5   amod   
5            Art           art   NOUN   NN  Number=Sing    2   conj   

          DEPS                                               MISC  
ID                                                                 
1       2:amod  Discourse=organization-heading:1->57:8:grf-ly-...  
2       0:root                       Entity=1)|MSeg=Appreciat-ion  
3         5:cc                                                  _  
4       5:amod     Entity=(2-abstract-new-cf2-2-sgl|MSeg=Span-ish  
5   2:conj:and                      

In [3]:
data_df.describe()

Unnamed: 0,FORM,LEMMA,UPOS,XPOS,FEATS,HEAD,DEPREL,DEPS,MISC
count,164108,164108,164108,164108,164108,164108,164108,164108,164108
unique,17226,12957,17,47,181,106,51,10849,68967
top,",",",",NOUN,NN,_,4,punct,0:root,_
freq,8647,8647,27288,20260,53205,10049,22748,9409,62470


In [4]:
prefelance_df = pd.DataFrame(data_df["UPOS"].value_counts())
prefelance_df.columns = ["COUNT"]
prefelance_df["SPECIAL TOKENS"] = 5  # Special tokens [UNK], [PAD], [CLS], [SEP], [MASK]
prefelance_df["NORMALIZED SUM"] = prefelance_df["COUNT"] + prefelance_df["SPECIAL TOKENS"]
prefelance_df["NORMALIZED SUM"] = prefelance_df["NORMALIZED SUM"] / prefelance_df["NORMALIZED SUM"].sum()
print(prefelance_df)

       COUNT  SPECIAL TOKENS  NORMALIZED SUM
UPOS                                        
NOUN   27288               5        0.166225
PUNCT  22748               5        0.138575
VERB   17189               5        0.104718
ADP    15476               5        0.094285
PRON   13798               5        0.084066
DET    13328               5        0.081203
ADJ    10764               5        0.065587
PROPN   9528               5        0.058060
AUX     8821               5        0.053754
ADV     7839               5        0.047773
CCONJ   5390               5        0.032858
PART    3934               5        0.023990
NUM     3170               5        0.019337
SCONJ   2648               5        0.016158
INTJ    1603               5        0.009793
X        317               5        0.001961
SYM      267               5        0.001657


In [5]:
vocab_distribution = np.array(17 * [5]) # 5 special tokens
target = prefelance_df["NORMALIZED SUM"].values
to_be_added = tk.assign_proportionally(vocab_distribution, target, 1000 - vocab_distribution.sum())
vocab_distribution += to_be_added
print(vocab_distribution.sum())
print(to_be_added)
print(vocab_distribution)

1000
[161 133  99  89  79  76  60  53  48  42  27  19  14  11   4   0   0]
[166 138 104  94  84  81  65  58  53  47  32  24  19  16   9   5   5]


In [6]:
vocab_size = 1000
upos_tags = [
        "ADJ",
        "ADP",
        "ADV",
        "AUX",
        "CCONJ",
        "DET",
        "INTJ",
        "NOUN",
        "NUM",
        "PART",
        "PRON",
        "PROPN",
        "PUNCT",
        "SCONJ",
        "SYM",
        "VERB",
        "X",
    ]
special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]

In [None]:
def merge_upos_vocabularies(upos_tokenizers, upos_tags):
    assert len(upos_tokenizers) == len(upos_tags), "Mismatch between tokenizers and UPOS tags"

    token_upos_map = defaultdict(set)
    for upos_tag in upos_tags:
        for token in upos_tokenizers[upos_tag].get_vocab().keys():
            token_upos_map[token].add(upos_tag)

    vocab_df = pd.DataFrame([
        {"TOKEN": token, "UPOS": ", ".join(sorted(upos))}
        for token, upos in token_upos_map.items()
    ])
    return vocab_df

In [16]:
# TODO: Don't keep count, use large vocab, extract merges, make as many as vocab_size allows

In [14]:
vocab_size = 1000
# Initial Training
upos_tokenizers = {}
upos_prevalence = data_df["UPOS"].value_counts(normalize=True)
target_distribution = np.array(len(upos_tags) * [len(special_tokens)]) # Reserve space for special tokens
target_distribution += tk.assign_proportionally(target_distribution,
                                                upos_prevalence,
                                                vocab_size - target_distribution.sum())
for i, upos_tag in enumerate(upos_tags):
    text = data_df[data_df["UPOS"] == upos_tag]["FORM"]
    upos_tokenizers[upos_tag] = tk.train_tokenizer(text, target_distribution[i], special_tokens)

vocab_df = merge_upos_vocabularies(upos_tokenizers, upos_tags)

while len(vocab_df) <= vocab_size:
    if len(vocab_df) > vocab_size:
        print("Vocab size exceeded")
        break
    to_be_added += tk.assign_proportionally(target_distribution, upos_prevalence, vocab_size - len(vocab_df))
    for i, upos_tag in enumerate(upos_tags):
            text = data_df[data_df["UPOS"] == upos_tag]["FORM"]
            upos_tokenizers[upos_tag] = tk.train_tokenizer(text, to_be_added[i], special_tokens, tokenizer=upos_tokenizers[upos_tag])
        
    vocab_df = merge_upos_vocabularies(upos_tokenizers, upos_tags)
    break

print(upos_tokenizers)
print(vocab_df)

{'ADJ': Tokenizer(version="1.0", truncation=None, padding=None, added_tokens=[{"id":0, "content":"[UNK]", "single_word":False, "lstrip":False, "rstrip":False, "normalized":False, "special":True}, {"id":1, "content":"[PAD]", "single_word":False, "lstrip":False, "rstrip":False, "normalized":False, "special":True}, {"id":2, "content":"[CLS]", "single_word":False, "lstrip":False, "rstrip":False, "normalized":False, "special":True}, {"id":3, "content":"[SEP]", "single_word":False, "lstrip":False, "rstrip":False, "normalized":False, "special":True}, {"id":4, "content":"[MASK]", "single_word":False, "lstrip":False, "rstrip":False, "normalized":False, "special":True}], normalizer=Sequence(normalizers=[NFD(), Lowercase(), StripAccents()]), pre_tokenizer=Sequence(pretokenizers=[Metaspace(replacement="▁", prepend_scheme=always, split=True)]), post_processor=TemplateProcessing(single=[SpecialToken(id="[CLS]", type_id=0), Sequence(id=A, type_id=0), SpecialToken(id="[SEP]", type_id=0)], pair=[Specia

In [None]:
upos_tokenizers = {}
vocab_df = pd.DataFrame()
running_id = 0
for upos, vocab in individual_vocab_size.items():
    text = data_df[data_df["UPOS"] == upos]["FORM"]
    upos_tokenizers[upos] = tk.train_tokenizer(text, vocab)
    next_df = pd.DataFrame(
        [(idx + running_id, token, upos) for token, idx in upos_tokenizers[upos].get_vocab().items()],
        columns=["ID", "TOKEN", "UPOS"]
    ).set_index("ID").sort_index()
    vocab_df = pd.concat([vocab_df, next_df])
    running_id += vocab


Individual vocab size: {'NOUN': 166, 'PUNCT': 138, 'VERB': 104, 'ADP': 94, 'PRON': 84, 'DET': 81, 'ADJ': 65, 'PROPN': 58, 'AUX': 53, 'ADV': 47, 'CCONJ': 32, 'PART': 23, 'NUM': 19, 'SCONJ': 16, 'INTJ': 9, 'X': 1, 'SYM': 1}
Total vocab size: 991


In [None]:
vocab_df_backup = vocab_df.copy()

In [None]:
vocab_df_groups = vocab_df.groupby("TOKEN")["UPOS"].apply(lambda x: ", ".join(sorted(set(x)))).reset_index()

In [None]:
vocab_df_groups

Unnamed: 0,TOKEN,UPOS
0,!,"PROPN, PUNCT"
1,"""","NOUN, PUNCT"
2,#,"NUM, PROPN, SYM"
3,$,"NOUN, NUM, SYM"
4,%,SYM
...,...,...
433,空,X
434,葉,X
435,谿,X
436,远,X


In [None]:
counts = vocab_df_groups.groupby("UPOS").size().reset_index(name="count")

In [None]:
vocab_df["count"] = 1  # one per token
fig = px.treemap(vocab_df,
                 path=["UPOS", "TOKEN"],
                 values="count",
                 title=f"Tokens by UPOS from Georgetown University Multilayer corpus with {len(vocab_df)} tokens",)

fig.show()

In [None]:
vocab_df_groups["count"] = 1  # one per token
fig = px.treemap(vocab_df_groups[(vocab_df_groups["TOKEN"].str.len() > 1) & (vocab_df_groups["UPOS"].str.len() > 5)],
                 path=["UPOS", "TOKEN"],
                 values="count",
                 title=f"Overlap of tokens with atleast 2 characters")

fig.show()

In [None]:
compare_vocab_df = vocab_df_backup.copy()
full_corpus_bpe = tk.train_tokenizer(data_df["FORM"], 1000)
full_corpus_vocab = pd.DataFrame(
        [(idx + vocab_df.iloc[-1].name + 1, token, "ANY") for token, idx in full_corpus_bpe.get_vocab().items()],
        columns=["ID", "TOKEN", "UPOS"]
    ).set_index("ID").sort_index()
full_corpus_vocab = pd.concat([full_corpus_vocab, vocab_df_backup.copy()]).sort_index()

In [None]:
print(vocab_df.iloc[-1].name)

1028


In [None]:
df = full_corpus_vocab.copy()
df["COUNT"] = 1
any_mask = df["UPOS"] == "ANY"
upos_mask = df["UPOS"] != "ANY"
df.loc[any_mask, "OVERLAP"] = df[df["UPOS"] == "ANY"]["TOKEN"].isin(df[df["UPOS"] != "ANY"]["TOKEN"])
df.loc[any_mask, "OVERLAP"] = df["OVERLAP"].map({
    True: "Common to both tokenizers",
    False: "Unique to classic tokenizer"  
})
df.loc[upos_mask, "OVERLAP"] = df[df["UPOS"] != "ANY"]["TOKEN"].isin(df[df["UPOS"] == "ANY"]["TOKEN"])
df.loc[upos_mask, "OVERLAP"] = df["OVERLAP"].map({
    True: "Common to both tokenizers",
    False: "Unique to POS-specific tokenizer"  
})
df.to_csv('your_dataframe.csv', index=False)
print(df)
fig = px.treemap(vocab_df_groups[(vocab_df_groups["TOKEN"].str.len() > 1) & (vocab_df_groups["UPOS"].str.len() > 5)],
                 path=["UPOS", "TOKEN"],
                 values="count",
                 title=f"Overlap of tokens with atleast 2 characters")

fig.show()

ValueError: cannot reindex on an axis with duplicate labels

In [None]:
df = full_corpus_vocab.copy()
duplicates = df[df.index.duplicated(keep=False)]
print(duplicates)

      TOKEN   UPOS
ID                
790       ʁ  PROPN
790   [UNK]    AUX
791   [PAD]    AUX
791       ʃ  PROPN
792       ʊ  PROPN
...     ...    ...
1065      [    ANY
1066      ]    ANY
1066      远      X
1067      静      X
1067      _    ANY

[287 rows x 2 columns]


In [None]:
test = vocab_df[vocab_df["UPOS"].isin(["NOUN", "VERB"])]
test.groupby("TOKEN")["UPOS"]
test

Unnamed: 0_level_0,TOKEN,UPOS,count
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,[UNK],NOUN,1
1,[PAD],NOUN,1
2,[CLS],NOUN,1
3,[SEP],NOUN,1
4,[MASK],NOUN,1
...,...,...,...
403,ad,VERB,1
404,ook,VERB,1
405,un,VERB,1
406,▁get,VERB,1


In [None]:
df = px.data.tips()
fig = px.treemap(df, path=[px.Constant("all"), 'day', 'time', 'sex'], values='total_bill')
fig.update_traces(root_color="lightgrey")
fig.update_layout(margin = dict(t=50, l=25, r=25, b=25))
fig.show()

In [None]:
fig = px.treemap(vocab_df, path=['UPOS'], values='TOKEN')
fig.update_traces(root_color="lightgrey")
fig.update_layout(margin = dict(t=50, l=25, r=25, b=25))
fig.show()

ValueError: Column `TOKEN` of `df` could not be converted to a numerical data type.

In [None]:
# Example: Assuming df has columns ["token", "id", "UPOS"]
nouns = set(vocab_df[vocab_df["UPOS"] == "NOUN"]["TOKEN"])
verbs = set(vocab_df[vocab_df["UPOS"] == "VERB"]["TOKEN"])
overlap = nouns & verbs
only_nouns = nouns - verbs
only_verbs = verbs - nouns

# Plot
plt.figure(figsize=(10, 8))
v = venn2([nouns, verbs], set_labels=("NOUN", "VERB"))

# Helper to scatter tokens within a circle region
def scatter_text(tokens, center, radius=0.2, label=""):
    angle_step = 2 * np.pi / max(len(tokens), 1)
    for i, token in enumerate(sorted(tokens)):
        angle = i * angle_step
        x = center[0] + radius * np.cos(angle)
        y = center[1] + radius * np.sin(angle)
        plt.text(x, y, token, ha='center', va='center', fontsize=10)

# Circle centers from default venn2 layout (approximate)
# You can tweak these if needed
centers = {
    '10': (-0.6, 0),     # Only NOUN
    '01': (0.6, 0),      # Only VERB
    '11': (0, 0)         # Overlap
}

# Add tokens spread within each region
scatter_text(only_nouns, centers['10'], radius=0.3)
scatter_text(only_verbs, centers['01'], radius=0.3)
scatter_text(overlap, centers['11'], radius=0.2)

plt.title("Token Overlap with Spread Tokens")
plt.axis('off')
plt.show()

In [None]:
pd.DataFrame(
    [(idx, token, upos) for token, idx in upos_tokenizers[upos].get_vocab().items()],
    columns=["ID", "TOKEN", "UPOS"]
).set_index("ID").sort_index()

Unnamed: 0_level_0,TOKEN,UPOS
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
0,[UNK],SYM
1,[PAD],SYM
2,[CLS],SYM
3,[SEP],SYM
4,[MASK],SYM
5,#,SYM
6,$,SYM
7,%,SYM
8,),SYM
9,*,SYM


In [None]:
whole_corpus_bpe = tk.train_tokenizer(data_df["FORM"], vocab_size)
encoded = whole_corpus_bpe.encode(str(data_df["FORM"][:40].values.tolist()))
print("Token splits: ", encoded.tokens)
print("Token IDs: ", encoded.ids)
print("Decoded: ", whole_corpus_bpe.decode(encoded.ids, skip_special_tokens=True))
print("Vocabulary size: ", whole_corpus_bpe.get_vocab_size())

Token splits:  ['[CLS]', '▁[', "'", 'a', 'es', 'th', 'et', 'ic', "'", ',', "▁'", 'a', 'pp', 're', 'ci', 'ation', "'", ',', "▁'", 'and', "'", ',', "▁'s", 'p', 'an', 'ish', "'", ',', "▁'", 'art', "'", ',', "▁'", ':', "'", ',', "▁'", 'in', 's', 'ight', 's', "'", ',', "▁'", 'f', 'r', 'om', "'", ',', "▁'", 'e', 'y', 'e', "'", ',', "▁'", '-', "'", ',', "▁'", 't', 'ra', 'c', 'king', "'", ',', "▁'", 'c', 'l', 'a', 'ire', "'", ',', "▁'", 'b', 'a', 'ile', 'y', "'", ',', "▁'", '-', "'", ',', "▁'", 'ross', "'", ',', "▁'", 'c', 'l', 'a', 'ire', '.', 'b', 'a', 'ile', 'y', '-', 'ross', '@', 'p', 'ort', '.', 'ac', '.', 'u', 'k', "'", ',', "▁'", 'un', 'i', 'vers', 'ity', "'", ',', "▁'", 'o', 'f', "'", ',', "▁'", 'p', 'ort', 's', 'm', 'ou', 'th', "'", ',', "▁'", ',', "'", ',', "▁'", 'un', 'ited', "'", ',', "▁'", 'king', 'd', 'om', "'", ',', "▁'", 'and', 're', 'w', "'", ',', "▁'", 'b', 'ere', 's', 'f', 'ord', "'", ',', "▁'", 'a', '.', 'm', '.', 'b', 'ere', 's', 'f', 'ord', '@', 'd', 'ur', 'h', 'am', '.',