In [61]:
import tokenizers
from pathlib import Path
import pandas as pd
import numpy as np
import plotly.express as px

In [42]:
# Directory holding the serialized tokenizer JSON files. The path is relative,
# so it is resolved against the notebook's current working directory.
tokenizer_json_path = Path("Tokenizer jsons")

# Tokenizer.from_file is handed a plain string here, hence the str() wrappers.
upos_json_file = tokenizer_json_path / "upos_bpe_tokenizer.json"
upos_bpe_tokenizer = tokenizers.Tokenizer.from_file(str(upos_json_file))

classic_json_file = tokenizer_json_path / "classic_bpe_tokenizer.json"
classic_bpe_tokenizer = tokenizers.Tokenizer.from_file(str(classic_json_file))

In [71]:
# One single-column membership frame per tokenizer, indexed by the keys of
# that tokenizer's vocabulary mapping. Every row is True by construction.
upos_bpe_df = pd.DataFrame({'UPOS': True}, index=upos_bpe_tokenizer.get_vocab().keys())
classic_bpe_df = pd.DataFrame({'CLASSIC': True}, index=classic_bpe_tokenizer.get_vocab().keys())

# The outer join keeps every token present in either vocabulary. A token that
# is missing from one side comes back as NA in that column, so the columns are
# converted to a nullable dtype and the gaps are filled with False.
df = upos_bpe_df.join(classic_bpe_df, how='outer').convert_dtypes().fillna(False)

# np.select only accepts plain boolean ndarrays as conditions. Depending on
# the pandas version, a nullable-boolean Series can convert implicitly to an
# object-dtype array, which np.select rejects with a TypeError. Build explicit
# NumPy bool masks instead; this is safe because the NAs were filled above.
in_upos = df['UPOS'].to_numpy(dtype=bool)
in_classic = df['CLASSIC'].to_numpy(dtype=bool)

# Conditions are evaluated in order and the first match wins, so the
# "in both" case must come before the two single-membership cases.
df['COMPARE'] = np.select(
    [
        in_upos & in_classic,   # Common to both
        in_upos,                # Unique to UPOS
        in_classic,             # Unique to CLASSIC
    ],
    [
        'BOTH',
        'UPOS',
        'CLASSIC',
    ],
    default='NONE',
)

In [123]:
# Upper bound on how many example tokens are drawn inside each category.
max_example_tokens = 32

# Flatten the comparison table into one row per token and attach the two
# per-token measures used below. CATEGORY SIZE is the number of tokens in the
# row's COMPARE category, computed BEFORE truncating to the example tokens.
df_treemap = (
    df.reset_index()
    .rename(columns={'index': 'TOKEN'})
    .assign(**{
        'CATEGORY SIZE': lambda frame: frame.groupby('COMPARE')['TOKEN'].transform('count'),
        'TOKEN LENGTH': lambda frame: frame['TOKEN'].str.len(),
    })
    .sort_values(by=['TOKEN LENGTH'], ascending=False)
)

# Keep only the longest `max_example_tokens` tokens of every category
# (rows are already ordered by descending token length).
df_treemap = (
    df_treemap.groupby('COMPARE')[['TOKEN', 'TOKEN LENGTH', 'COMPARE', 'CATEGORY SIZE']]
    .apply(lambda group: group.head(max_example_tokens))
    .reset_index(drop=True)
)

# A category's area is the SUM of its tiles. If every tile carried the full
# CATEGORY SIZE, the category would be drawn as (tiles shown * category size),
# which under-sizes any category with fewer than `max_example_tokens` tokens.
# Give each tile an equal share instead, so the tiles of a category add up to
# exactly its CATEGORY SIZE.
examples_shown = df_treemap.groupby('COMPARE')['TOKEN'].transform('count')
df_treemap['TILE SIZE'] = df_treemap['CATEGORY SIZE'] / examples_shown

fig = px.treemap(
    df_treemap,
    path=['COMPARE', 'TOKEN'],
    values='TILE SIZE',
    color='COMPARE',
    color_discrete_map={
        'BOTH': 'green',
        'UPOS': 'blue',
        'CLASSIC': 'orange',
        'NONE': 'gray'
    },
    # Tile values are now fractional shares, so keep the real category size
    # available in the hover tooltip.
    hover_data=['CATEGORY SIZE'],
    title="Vocabulary Comparison: UPOS vs CLASSIC"
)
fig.add_annotation(
    text="Sized by group size",  # Subtitle text
    xref="paper",  # x and y coordinates relative to the paper (canvas)
    yref="paper",
    x=0,
    y=-0.15,
    showarrow=False,
    font=dict(size=14, color="black"),
    align="left",
)
fig.show()

In [None]:
# Upper bound on how many example tokens are drawn inside each category.
max_example_tokens = 32

# One row per token, with the size of the token's COMPARE category and the
# token's character length, ordered from the longest token to the shortest.
df_treemap = (
    df.reset_index()
    .rename(columns={'index': 'TOKEN'})
    .assign(**{
        'CATEGORY SIZE': lambda frame: frame.groupby('COMPARE')['TOKEN'].transform('count'),
        'TOKEN LENGTH': lambda frame: frame['TOKEN'].str.len(),
    })
    .sort_values(by=['TOKEN LENGTH'], ascending=False)
)

# Keep the first `max_example_tokens` rows of each category, i.e. its longest
# tokens given the ordering above.
treemap_columns = ['TOKEN', 'TOKEN LENGTH', 'COMPARE', 'CATEGORY SIZE']
df_treemap = (
    df_treemap.groupby('COMPARE')[treemap_columns]
    .apply(lambda group: group.head(max_example_tokens))
    .reset_index(drop=True)
)

# Fixed colour per comparison category.
category_colors = {
    'BOTH': 'green',
    'UPOS': 'blue',
    'CLASSIC': 'orange',
    'NONE': 'gray',
}

# Each tile is sized by the length of the token it shows.
fig = px.treemap(
    df_treemap,
    path=['COMPARE', 'TOKEN'],
    values='TOKEN LENGTH',
    color='COMPARE',
    color_discrete_map=category_colors,
    title="Vocabulary Comparison: UPOS vs CLASSIC ",
)

# Subtitle placed below the plot; x and y are in paper (canvas) coordinates.
subtitle_options = {
    'text': "Sized by Token length",
    'xref': "paper",
    'yref': "paper",
    'x': 0,
    'y': -0.15,
    'showarrow': False,
    'font': {'size': 14, 'color': "black"},
    'align': "left",
}
fig.add_annotation(**subtitle_options)
fig.show()