In [36]:
import sys
import os
from IPython import get_ipython

# Append the absolute path of ./lib (resolved against the current working
# directory) to the module search path.
# NOTE(review): the imports below use `from lib import ...`, which resolves the
# `lib` package itself rather than its contents -- confirm this entry is needed.
sys.path.append(os.path.abspath("./lib"))
# Load the autoreload extension only if its line magic is not registered yet,
# so re-running this cell does not load it a second time.
if 'autoreload' not in get_ipython().magics_manager.magics['line']:
    %load_ext autoreload
# Mode 2: pick up edits to imported modules (e.g. the lib helpers) live.
%autoreload 2

from lib import dataloading as dl
from lib import tokenizer as tk
import torch
import tokenizers
import warnings
import numpy as np
import pandas as pd
from matplotlib_venn import venn2, venn3  
from matplotlib import pyplot as plt 
import plotly.express as px
from collections import defaultdict

In [37]:
# Load the UD English-GUM training split and drop every row whose UPOS value is
# not a real UPOS tag.
# NOTE(review): machine-specific absolute path -- this cell only runs on the
# original author's machine; consider a configurable data directory.
train_conllu_path = r"D:\Dropbox\Bachlorarbeit\Datasets\Universal Dependencies 2.15\ud-treebanks-v2.15\UD_English-GUM\en_gum-ud-train.conllu"
raw_treebank_df = dl.load_conllu(train_conllu_path)
data_df = dl.clear_non_UPOS_tags(raw_treebank_df)
print(data_df.head())

Dropped 2810 rows with non-UPOS tags 
Tags dropped: ['_']
            FORM         LEMMA   UPOS XPOS        FEATS HEAD DEPREL  \
ID                                                                    
1      Aesthetic     aesthetic    ADJ   JJ   Degree=Pos    2   amod   
2   Appreciation  appreciation   NOUN   NN  Number=Sing    0   root   
3            and           and  CCONJ   CC            _    5     cc   
4        Spanish       Spanish    ADJ   JJ   Degree=Pos    5   amod   
5            Art           art   NOUN   NN  Number=Sing    2   conj   

          DEPS                                               MISC  
ID                                                                 
1       2:amod  Discourse=organization-heading:1->57:8:grf-ly-...  
2       0:root                       Entity=1)|MSeg=Appreciat-ion  
3         5:cc                                                  _  
4       5:amod     Entity=(2-abstract-new-cf2-2-sgl|MSeg=Span-ish  
5   2:conj:and                      

In [38]:
# Notebook-wide configuration.
# Target size of the final vocabulary.
vocab_size = 1000

# UPOS tag inventory, kept in alphabetical order.
upos_tags = [
    "ADJ", "ADP", "ADV", "AUX", "CCONJ", "DET",
    "INTJ", "NOUN", "NUM", "PART", "PRON", "PROPN",
    "PUNCT", "SCONJ", "SYM", "VERB", "X",
]

# Special tokens reserved in every vocabulary.
special_tokens = [
    "[UNK]",
    "[PAD]",
    "[CLS]",
    "[SEP]",
    "[MASK]",
]

In [39]:
# Relative frequency of each UPOS tag among the remaining rows; the same
# proportions are later used as the target for splitting the vocabulary budget.
data_df["UPOS"].value_counts(normalize=True)

UPOS
NOUN     0.166281
PUNCT    0.138616
VERB     0.104742
ADP      0.094304
PRON     0.084079
DET      0.081215
ADJ      0.065591
PROPN    0.058059
AUX      0.053751
ADV      0.047767
CCONJ    0.032844
PART     0.023972
NUM      0.019317
SCONJ    0.016136
INTJ     0.009768
X        0.001932
SYM      0.001627
Name: proportion, dtype: float64

In [40]:
# Train one tokenizer per UPOS tag on the word forms carrying that tag, then
# merge the per-tag vocabularies into a single vocabulary of (about)
# `vocab_size` tokens, giving each tag a share proportional to its frequency.
# `vocab_size`, `upos_tags` and `special_tokens` come from the configuration
# cell; they are intentionally not redefined here so there is a single source
# of truth for them.
tokenizers_upos = {}
merges_upos = {}
vocab_upos = {}
for upos_tag in upos_tags:
    text = data_df.loc[data_df["UPOS"] == upos_tag, "FORM"].tolist()
    tokenizers_upos[upos_tag] = tk.train_tokenizer(text, vocab_size)
    vocab_upos[upos_tag], merges_upos[upos_tag] = tk.extract_vocab_and_merges(tokenizers_upos[upos_tag])

# Align the target proportions with `upos_tags` explicitly instead of relying
# on `upos_tags` happening to be in alphabetical order; a tag without any rows
# gets a target share of 0.
target_allocation = data_df["UPOS"].value_counts(normalize=True).reindex(upos_tags, fill_value=0.0)
# Every per-tag slice starts with room for the special tokens.
vocab_allocation = np.array([len(special_tokens)] * len(upos_tags))

# Materialise each tag's token sequence once; the loop below only slices it.
# NOTE(review): assumes iterating vocab_upos[tag] yields tokens in priority
# order (special tokens first) -- confirm against tk.extract_vocab_and_merges.
tokens_by_tag = {tag: list(vocab_upos[tag]) for tag in upos_tags}

vocab_set = set()
while len(vocab_set) < vocab_size:
    # Per-tag slices overlap (shared characters, common subwords), so the union
    # is smaller than the summed allocation; keep handing out the missing
    # budget until the union reaches the target size.
    increment = tk.assign_proportionally(vocab_allocation, target_allocation, vocab_size - len(vocab_set))
    vocab_allocation += increment
    for idx, upos_tag in enumerate(upos_tags):
        allocated = tokens_by_tag[upos_tag][:vocab_allocation[idx]]
        print(f"Vocab allocation for {upos_tag}: {allocated}")
        vocab_set.update(allocated)

    # Guard against an endless loop: once every tag's vocabulary is fully
    # consumed the union cannot grow any more, and an all-zero increment would
    # repeat the same pass (assuming tk.assign_proportionally is deterministic).
    exhausted = all(
        vocab_allocation[idx] >= len(tokens_by_tag[tag])
        for idx, tag in enumerate(upos_tags)
    )
    if len(vocab_set) < vocab_size and (exhausted or not np.any(increment)):
        warnings.warn(
            f"Could not reach vocab size {vocab_size}: stopped at {len(vocab_set)} tokens "
            "because the per-tag vocabularies cannot supply more tokens."
        )
        break

# Keep only merges whose two parts and whose result all survived into the
# merged vocabulary.
merges_set = set()
for upos_tag in upos_tags:
    for merge in merges_upos[upos_tag]:
        left, right = merge[0], merge[1]
        if left in vocab_set and right in vocab_set and (left + right) in vocab_set:
            merges_set.add(merge)

print(f"Total merges: {len(merges_set)}")
print(merges_set)
print(f"Total vocab size: {len(vocab_set)}")
print(vocab_set)

Vocab allocation for ADJ: ['[UNK]', '[PAD]', '[CLS]', '[SEP]', '[MASK]', '(', ')', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '=', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '²', '–', '▁', 'al', 'er', '▁s', 're', 'ti', 'an', 'le', 'st', '▁m', 'on', 'ar', 'en', 'ic', '▁f', 'in', '▁c', '▁p', '▁l', 'or', '▁o', '▁n']
Vocab allocation for ADP: ['[UNK]', '[PAD]', '[CLS]', '[SEP]', '[MASK]', '.', '@', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', '▁', '▁o', '▁of', 'in', '▁in', '▁a', '▁t', '▁f', '▁to', 'or', '▁for', '▁on', 'it', 'wit', '▁wit', '▁with', 'ro', '▁b', '▁at', '▁fro', '▁from', '▁by', 'ut', '▁as', '▁u', 'er', '▁th', 'to', '▁up', 'out', '▁out', '▁into', 'ik', 'lik', 'like', '▁like', '▁be', 'rou', '▁ab', '▁about', 'nd', '▁d', 'ver', '▁over', 'gh', '▁throu', '▁through', 'an', '▁than', 'on', 'en', 'een', 'tw', '▁

In [41]:
# Caveat: these proportions are only meaningful under a strict assignment of
# the vocabulary budget, not under the current allocation scheme.
print(f"Vocab allocation for vocab size {vocab_size}:")

reserved_slots_total = len(upos_tags) * 5  # five special-token slots per tag
allocation_without_specials = vocab_allocation - 5
df_vocab_allocation = pd.DataFrame(
    {
        "UPOS": upos_tags,
        "Vocab Size": vocab_allocation,
        "Proportion All": vocab_allocation / vocab_size,
        # Share of the non-special-token slots that went to each tag.
        "Proportion Unique": allocation_without_specials / (sum(vocab_allocation) - reserved_slots_total),
    }
)
df_vocab_allocation

Vocab allocation for vocab size 1000:


Unnamed: 0,UPOS,Vocab Size,Proportion All,Proportion Unique
0,ADJ,88,0.088,0.041028
1,ADP,175,0.175,0.084034
2,ADV,51,0.051,0.022739
3,AUX,58,0.058,0.026199
4,CCONJ,35,0.035,0.014829
5,DET,131,0.131,0.062284
6,INTJ,10,0.01,0.002472
7,NOUN,646,0.646,0.316856
8,NUM,20,0.02,0.007415
9,PART,26,0.026,0.010381


In [42]:
# Normalisation applied to every word form before collecting its characters:
# NFD decomposition, lowercasing, then accent stripping.
# NOTE(review): presumably mirrors the normaliser used inside tk.train_tokenizer -- confirm.
normalizer = tokenizers.normalizers.Sequence(
    [
        tokenizers.normalizers.NFD(),  # Unicode Normalizer
        tokenizers.normalizers.Lowercase(),
        tokenizers.normalizers.StripAccents(),
    ]
)

# Every distinct character that occurs in the normalised FORM column.
all_chars = {
    char
    for raw_form in data_df["FORM"].values
    for char in normalizer.normalize_str(raw_form)
}

# Characters that have no single-character entry in the merged vocabulary.
missing_chars = all_chars - vocab_set

# Report
print(f"Total unique characters in FORM: {len(all_chars)}")
print(f"Characters not in vocab_set: {len(missing_chars)}")
print("Missing characters:")
print(sorted(missing_chars))

Total unique characters in FORM: 136
Characters not in vocab_set: 48
Missing characters:
['%', '<', '£', '§', '½', '×', 'ß', 'ν', 'ο', 'ς', 'σ', 'υ', 'φ', 'ω', 'ا', 'ح', 'د', 'ز', 'ص', 'ف', 'ن', 'ه', 'ي', '€', '☎', '☏', '✉', '一', '况', '口', '古', '司', '和', '图', '夏', '奈', '子', '山', '樋', '津', '清', '澹', '琴', '空', '葉', '谿', '远', '静']


In [43]:
# Build the token -> id mapping in a reproducible order. Iterating a set of
# strings directly yields a different order in every kernel session (string
# hashing is randomised per process), which would give every run different
# token ids. Special tokens come first, in their declared order, followed by
# the remaining tokens in sorted order.
_special_in_vocab = [token for token in special_tokens if token in vocab_set]
_regular_tokens = sorted(vocab_set.difference(_special_in_vocab))
vocab = {token: idx for idx, token in enumerate(_special_in_vocab + _regular_tokens)}

# BPE treats the position of a merge in the list as its priority, so the list
# must not come from arbitrary set order. Rank each surviving merge by the
# earliest position it holds in any per-tag merge list; ties are broken by the
# merge pair itself so the result is fully deterministic.
# NOTE(review): assumes merges_upos[tag] is ordered by merge rank -- confirm
# against tk.extract_vocab_and_merges.
_merge_rank = {}
for upos_tag in upos_tags:
    for rank, merge in enumerate(merges_upos[upos_tag]):
        if merge in merges_set and rank < _merge_rank.get(merge, float("inf")):
            _merge_rank[merge] = rank
merges = sorted(merges_set, key=lambda pair: (_merge_rank.get(pair, float("inf")), pair))

In [46]:
# Assemble the UPOS-informed BPE tokenizer from the merged vocabulary and
# merges; the save path is relative to the current working directory.
tokenizer = tk.tokenizer_from_vocab_and_merges(
    "bpe",
    vocab,
    merges,
    save_path=r"Tokenizer jsons/upos_bpe_tokenizer.json",
)

In [48]:
# Baseline for comparison: one tokenizer trained on all word forms at once,
# with the same vocabulary budget.
# NOTE: this rebinds `tokenizer`, replacing the UPOS-based tokenizer object
# created in the previous cell.
vocab_size = 1000
all_forms = data_df["FORM"].values.tolist()
tokenizer = tk.train_tokenizer(all_forms, vocab_size)
tokenizer.save(r"Tokenizer jsons/classic_bpe_tokenizer.json")