In [24]:
import tokenizers
from pathlib import Path
import pandas as pd
import numpy as np
import plotly.express as px

from lib import dataloading as dl
from lib import tokenizer as tk

In [25]:
# Directory holding the serialized tokenizer definitions (one JSON file per tokenizer).
tokenizer_json_path = Path("Tokenizer jsons")

# Each file location is converted to a plain string before it is handed to Tokenizer.from_file.
upos_bpe_tokenizer = tokenizers.Tokenizer.from_file(
    str(tokenizer_json_path.joinpath("upos_bpe_tokenizer.json"))
)
classic_bpe_tokenizer = tokenizers.Tokenizer.from_file(
    str(tokenizer_json_path.joinpath("classic_bpe_tokenizer.json"))
)

In [26]:
# One single-column membership frame per tokenizer, indexed by vocabulary token.
upos_vocab_tokens = upos_bpe_tokenizer.get_vocab().keys()
classic_vocab_tokens = classic_bpe_tokenizer.get_vocab().keys()
upos_bpe_df = pd.DataFrame({'UPOS': True}, index=upos_vocab_tokens)
classic_bpe_df = pd.DataFrame({'CLASSIC': True}, index=classic_vocab_tokens)

# The outer join keeps every token that occurs in either vocabulary; a token
# missing from one side comes out as a missing value, which is filled with False.
df = (
    upos_bpe_df
    .join(classic_bpe_df, how='outer')
    .convert_dtypes()
    .fillna(False)
)

# np.select takes the first condition that matches, so the "in both" test has
# to come before the single-vocabulary tests.
in_upos = df['UPOS']
in_classic = df['CLASSIC']
membership_conditions = [
    in_upos & in_classic,  # present in both vocabularies
    in_upos,               # present only in the UPOS vocabulary
    in_classic,            # present only in the CLASSIC vocabulary
]
membership_labels = ['BOTH', 'UPOS', 'CLASSIC']
df['COMPARE'] = np.select(membership_conditions, membership_labels, default='NONE')

In [27]:
# Treemap of the vocabulary overlap in which every tile is sized by the number
# of tokens in its COMPARE category.
max_example_tokens = 32  # number of example tokens kept per category
treemap_columns = ['TOKEN', 'TOKEN LENGTH', 'COMPARE', 'CATEGORY SIZE']

# Build the plotting frame as one chain (no in-place mutation): the token moves
# from the index into a TOKEN column, then each row gets the size of its
# category and the character length of its token, longest tokens first.
df_treemap = (
    df.reset_index()
    .rename(columns={'index': 'TOKEN'})
    .assign(**{
        'CATEGORY SIZE': lambda frame: frame.groupby('COMPARE')['TOKEN'].transform('count'),
        'TOKEN LENGTH': lambda frame: frame['TOKEN'].str.len(),
    })
    .sort_values(by=['TOKEN LENGTH'], ascending=False)
)

# Because the frame is already sorted, head() keeps the longest tokens of each category.
df_treemap = (
    df_treemap.groupby('COMPARE')[treemap_columns]
    .apply(lambda group: group.head(max_example_tokens))
    .reset_index(drop=True)
)

fig = px.treemap(
    df_treemap,
    path=['COMPARE', 'TOKEN'],
    values='CATEGORY SIZE',
    color='COMPARE',
    color_discrete_map={
        'BOTH': 'green',
        'UPOS': 'blue',
        'CLASSIC': 'orange',
        'NONE': 'gray'
    },
    title="Vocabulary Comparison: UPOS vs CLASSIC"
)
# Caption below the plot area; wording fixed from "order by length" to "ordered by length".
fig.add_annotation(
    text=f"Sized by group size, showing top {max_example_tokens} tokens ordered by length",
    xref="paper",
    yref="paper",
    x=0.005,
    y=-0.15,
    showarrow=False,
    font=dict(size=14, color="black"),
    align="left",
)
fig.show()

In [28]:
# Treemap of the vocabulary overlap in which every tile is sized by the
# character length of its token.
max_example_tokens = 32  # number of example tokens kept per category
category_colors = {
    'BOTH': 'green',
    'UPOS': 'blue',
    'CLASSIC': 'orange',
    'NONE': 'gray'
}

# Move the token from the index into a TOKEN column, then add the two measures.
tokens_frame = df.reset_index().rename(columns={'index': 'TOKEN'})
tokens_frame['CATEGORY SIZE'] = tokens_frame.groupby('COMPARE')['TOKEN'].transform('count')
tokens_frame['TOKEN LENGTH'] = tokens_frame['TOKEN'].str.len()
longest_first = tokens_frame.sort_values(by=['TOKEN LENGTH'], ascending=False)

# The frame is sorted by length, so head() keeps the longest tokens of each category.
df_treemap = (
    longest_first.groupby('COMPARE')[['TOKEN', 'TOKEN LENGTH', 'COMPARE', 'CATEGORY SIZE']]
    .apply(lambda group: group.head(max_example_tokens))
    .reset_index(drop=True)
)

fig = px.treemap(
    df_treemap,
    path=['COMPARE', 'TOKEN'],
    values='TOKEN LENGTH',
    color='COMPARE',
    color_discrete_map=category_colors,
    title="Vocabulary Comparison: UPOS vs CLASSIC "
)

# Caption placed below the plot area, in paper coordinates.
caption = f"Sized by Token length, showing top {max_example_tokens} tokens ordered by length"
fig.add_annotation(
    text=caption,
    xref="paper",
    yref="paper",
    x=0.005,
    y=-0.15,
    showarrow=False,
    font=dict(size=14, color="black"),
    align="left",
)
fig.show()

In [29]:
# NOTE(review): this is an absolute, machine-specific location; the cell only
# runs where the treebank is stored under exactly this path.
conllu_path = r"D:\Dropbox\Bachlorarbeit\Datasets\Universal Dependencies 2.15\ud-treebanks-v2.15\UD_English-GUM\en_gum-ud-test.conllu"

# Load the treebank file and pass it through the project's tag filter
# (judging by its name it removes rows whose tag is not a UPOS tag - confirm in lib.dataloading).
data_df = dl.clear_non_UPOS_tags(dl.load_conllu(conllu_path))
print(data_df.head())

Dropped 326 rows with non-UPOS tags 
Tags dropped: ['_']
              FORM           LEMMA  UPOS XPOS                      FEATS HEAD  \
ID                                                                              
1              The             the   DET   DT  Definite=Def|PronType=Art    2   
2       prevalence      prevalence  NOUN   NN                Number=Sing    0   
3               of              of   ADP   IN                          _    4   
4   discrimination  discrimination  NOUN   NN                Number=Sing    2   
5           across          across   ADP   IN                          _    7   

   DEPREL       DEPS                                               MISC  
ID                                                                       
1     det      2:det  Discourse=organization-heading:1->38:5:grf-ly-...  
2    root     0:root                                   MSeg=preval-ence  
3    case     4:case                                                  _  
4    

In [30]:
# Show the vocabulary id that the UPOS tokenizer assigns to its unknown token.
upos_unk_token = upos_bpe_tokenizer.model.unk_token
print(upos_bpe_tokenizer.token_to_id(upos_unk_token))


572


In [31]:
# Encode every word form on its own (one Encoding per row of data_df) with both tokenizers.
forms = data_df['FORM'].values.tolist()
upos_encoded = [upos_bpe_tokenizer.encode(text) for text in forms]
classic_encoded = [classic_bpe_tokenizer.encode(text) for text in forms]

# Drop the first and last token of every encoding (the probe cell further down
# prints them as '[CLS]' and '[SEP]').
upos_encoded_cleaned = [enc.tokens[1:-1] for enc in upos_encoded]
classic_encoded_cleaned = [enc.tokens[1:-1] for enc in classic_encoded]

upos_unk_id = upos_bpe_tokenizer.token_to_id(upos_bpe_tokenizer.model.unk_token)
classic_unk_id = classic_bpe_tokenizer.token_to_id(classic_bpe_tokenizer.model.unk_token)

# Count how often the unknown-token id occurs across all encodings. The id has
# to be looked up inside each encoding's id list: comparing the Encoding object
# itself to the integer id never matches and always produced a count of 0.
upos_num_unk = sum(enc.ids.count(upos_unk_id) for enc in upos_encoded)
classic_num_unk = sum(enc.ids.count(classic_unk_id) for enc in classic_encoded)

# NOTE(review): these lengths are taken from the full token lists, so they still
# include the first/last tokens that were stripped for the *_cleaned lists -
# confirm that this is the intended count.
upos_lengths = [len(enc.tokens) for enc in upos_encoded]
classic_lengths = [len(enc.tokens) for enc in classic_encoded]

In [32]:
# Does any encoding contain the unknown-token id? The id must be tested against
# each encoding's own id list; testing it against the list of id lists compares
# an int with lists and is always False. The second check also has to use the
# CLASSIC unknown-token id rather than the UNK count.
print(any(upos_unk_id in enc.ids for enc in upos_encoded))
print(any(classic_unk_id in enc.ids for enc in classic_encoded))

False
False


In [33]:
# Probe both tokenizers with an emoji: print the produced tokens and whether
# the tokenizer's unknown-token id is among the produced ids.
for probe_tokenizer, probe_unk_id in (
    (upos_bpe_tokenizer, upos_unk_id),
    (classic_bpe_tokenizer, classic_unk_id),
):
    test = probe_tokenizer.encode("🤗")
    print(test.tokens)
    print(probe_unk_id in test.ids)

# Compare the tokens for the lower-case and capitalised spelling of the same word.
lowercase_tokens = upos_bpe_tokenizer.encode("the").tokens
print(lowercase_tokens)

upos_bpe_tokenizer.encode("The").tokens


['[CLS]', '▁', '[UNK]', '[SEP]']
True
['[CLS]', '▁', '[UNK]', '[SEP]']
True
['[CLS]', '▁the', '[SEP]']


['[CLS]', '▁the', '[SEP]']

In [34]:
# Per-word table: the word form, the tokens each tokenizer produced for it,
# and the token counts computed in the encoding cell.
per_text_columns = {
    "Text": data_df['FORM'],
    "UPOS Encoded": upos_encoded_cleaned,
    "CLASSIC Encoded": classic_encoded_cleaned,
    "UPOS_Tokens": upos_lengths,
    "CLASSIC_Tokens": classic_lengths
}
stats_df = pd.DataFrame(per_text_columns)

# One summary record per tokenizer; the means are taken over the encoded word forms.
summary_rows = [
    {
        "Tokenizer": "UPOS",
        "Mean Tokens": sum(upos_lengths) / len(upos_lengths),
        "Total Tokens": sum(upos_lengths),
        "Mean UNK Tokens": upos_num_unk / len(upos_encoded),
        "Total UNK Tokens": upos_num_unk,
    },
    {
        "Tokenizer": "CLASSIC",
        "Mean Tokens": sum(classic_lengths) / len(classic_lengths),
        "Total Tokens": sum(classic_lengths),
        "Mean UNK Tokens": classic_num_unk / len(classic_encoded),
        "Total UNK Tokens": classic_num_unk,
    },
]
# Only the two mean columns are rounded (to two decimals); the totals stay untouched.
summary = pd.DataFrame(summary_rows).round({"Mean Tokens": 2, "Mean UNK Tokens": 2})

print("Per-Text Token Counts:", stats_df, sep="\n")
print("\nSummary:", summary, sep="\n")

Per-Text Token Counts:
              Text               UPOS Encoded              CLASSIC Encoded  \
ID                                                                           
1              The                     [▁the]                       [▁the]   
2       prevalence    [▁, pr, e, v, al, ence]          [▁pre, v, al, ence]   
3               of                      [▁of]                        [▁of]   
4   discrimination  [▁disc, ri, m, in, ation]  [▁dis, c, r, im, in, ation]   
5           across              [▁ac, ro, ss]                  [▁ac, ross]   
..             ...                        ...                          ...   
14            with                    [▁with]                      [▁with]   
15            your                    [▁your]                      [▁your]   
16         nesting             [▁n, est, ing]               [▁n, est, ing]   
17             box                     [▁box]                     [▁bo, x]   
18               .                       