In [1]:
%cd /home/yali/MEGA/Hack The Tockenizer/notebooks

/home/yali/MEGA/Hack The Tockenizer/notebooks


In [2]:
# %cd "C:\Users\yakim\Documents\MEGA\03. Vida Académica\03. Mestrado Ciencias Computadores\Dissertacao\Hack The Tockenizer\notebooks"

In [3]:
import os
import json
import datetime as dt
from io import StringIO
import ipywidgets
from IPython.display import display, HTML
from plotly.subplots import make_subplots
import plotly.graph_objects as go

import tqdm
import numpy as np
import pandas as pd

from pathlib import Path

# Run artefacts (analysis_*.parquet / results_*.json) are read from the
# "outputs" folder located next to the current working directory.
output_dir = Path.cwd().parent / "outputs"

# Make DataFrame.plot() build plotly figures by default
pd.set_option("plotting.backend", "plotly")

In [4]:
# dfs maps 'analysis@<timestamp>' -> DataFrame and, for convenience, also
# 0, 1, 2, ... -> the very same DataFrame objects ("num index" aliases).
dfs: dict[str|int, pd.DataFrame] = {}
# results maps 'results@<timestamp>' -> parsed JSON content of the run.
results: dict[str, dict] = {}

# os.listdir() yields entries in arbitrary order; the file names embed the run
# timestamp, so sorting them makes the numeric aliases chronological and
# identical on every machine.
for file in tqdm.tqdm(sorted(os.listdir(output_dir)), desc='Loading files'):
    if file.endswith("parquet"):
        key = dt.datetime.strptime(str(file), 'analysis_%Y%m%d%H%M%S.parquet').strftime('analysis@%Y-%m-%d %H:%M:%S')
        frame = pd.read_parquet(output_dir / file)

        # Convert Categorical columns back to "number" or "string"
        for col in frame.select_dtypes(include=['category']).columns:
            # Try converting to numeric first, if that fails convert to string.
            # Only conversion failures are caught so that unrelated errors
            # (and KeyboardInterrupt) are no longer silently swallowed.
            try:
                frame[col] = frame[col].astype(int)
            except (ValueError, TypeError):
                frame[col] = frame[col].astype(str)

        # Convert unsigned integer columns to regular integers
        for col in frame.select_dtypes(include=[np.unsignedinteger]).columns:
            frame[col] = frame[col].astype(int)

        # Next free numeric alias = number of frames loaded so far
        num_index = sum(isinstance(existing, int) for existing in dfs)
        dfs[key] = frame
        dfs[num_index] = frame   # Same object under a "num index" for easier access
    elif file.endswith("json"):
        key = dt.datetime.strptime(str(file), 'results_%Y%m%d%H%M%S.json').strftime('results@%Y-%m-%d %H:%M:%S')
        with open(output_dir / file, 'r', encoding='utf-8') as f:
            results[key] = json.load(f)

Loading files: 100%|██████████| 16/16 [00:09<00:00,  1.61it/s]


# Defining Analysis Functions

In [5]:
# Log Scale menus
def log_button(x, y, axis):
    """Build a plotly ``updatemenus`` entry holding one linear/log toggle button.

    Args:
        x: Horizontal position of the button (passed through as ``x``).
        y: Vertical position of the button (passed through as ``y``).
        axis: Layout key of the axis to toggle, e.g. ``'xaxis'``, ``'yaxis'``,
            ``'xaxis2'``. It is used verbatim to build the ``"<axis>.type"``
            relayout key.

    Returns:
        dict: A menu specification suitable for ``fig.update_layout(updatemenus=[...])``.

    The button label names the axis it acts on: keys starting with ``y`` get
    "Log (Y-Axis)", everything else keeps the "Log (X-Axis)" label.
    """
    axis_letter = 'Y' if str(axis).lower().startswith('y') else 'X'
    toggle = dict(
        label=f"Log ({axis_letter}-Axis)",
        method="relayout",
        # `args` / `args2` are the two alternating relayout payloads; see the
        # plotly updatemenus reference for how `args2` turns a button into a toggle.
        args=[{f"{axis}.type": "linear"}],
        args2=[{f"{axis}.type": "log"}],
    )
    return dict(
        type="buttons",
        x=x,
        y=y,
        active=0,  # Set initial state (0 for linear)
        buttons=[toggle],
    )

In [6]:
def get_rank_comparison(df: pd.DataFrame, *__, width=1000, **_):
    """Plot how many tokens fall on each rank, for new vs. old tokens.

    Args:
        df: Frame with the columns ``new_token_rank``, ``old_token_rank`` and
            ``new_token_id`` (the latter is what gets counted in both groupings).
        width: Base width; the figure is drawn ``2.2`` times as wide.
        *__, **_: Ignored; accepted so every analysis function can be called
            with the same arguments.

    Returns:
        list: A single two-panel figure (per-rank counts on the left, running
        totals on the right), each axis with its own linear/log toggle.
    """
    def _count_per_rank(rank_col, count_label):
        # Number of non-null `new_token_id` values for every distinct rank
        counts = df.groupby(by=[rank_col], as_index=False)[['new_token_id']].count()
        counts.columns = ['rank', count_label]
        return counts

    ranks = (
        _count_per_rank('new_token_rank', '#new_token')
        .merge(_count_per_rank('old_token_rank', '#old_token'), how='outer')
        .fillna(0)  # a rank seen on one side only counts as 0 on the other
        .sort_values(by='rank', ascending=True)
        .reset_index(drop=True)
    )
    for count_col in ('#new_token', '#old_token'):
        ranks[f'{count_col}_acc'] = ranks[count_col].cumsum()

    fig = make_subplots(rows=1, cols=2, subplot_titles=['Rank Distribution', 'Rank Accumulative Distribution'])
    panels = [
        # Normal view
        ranks.plot(x='rank', y=['#new_token', '#old_token'], title='Rank Distribution'),
        # Accumulative view
        ranks.plot(x='rank', y=['#new_token_acc', '#old_token_acc'], title='Rank Acc Distribution'),
    ]
    # Move the traces of each stand-alone plot into its subplot column
    for panel_col, panel in enumerate(panels, start=1):
        for trace in panel.data:
            fig.add_trace(trace, row=1, col=panel_col)

    # One log toggle per axis, laid out left to right above the panels
    left, top = 0.12, 1.15
    step = 0.30
    button_slots = [
        (left, 'xaxis'),
        (left + step*1, 'yaxis'),
        (left + step*1 + 0.25, 'xaxis2'),
        (left + step*2 + 0.25, 'yaxis2'),
    ]
    fig.update_layout(
        title_text="Rank Distribution [NewTokens Vs OldTokens]",
        showlegend=True,
        width=width*2.2,
        updatemenus=[log_button(pos, top, axis) for pos, axis in button_slots],  # type: ignore
    )
    return [fig]

def get_logit_comparison(df: pd.DataFrame, *__, width=1000, **_):
    """Plot how many tokens share each logit value, for new vs. old tokens.

    Args:
        df: Frame with the columns ``new_token_logits``, ``old_token_logits``
            and ``new_token_id`` (the latter is what gets counted).
        width: Base width; the figure is drawn ``2.2`` times as wide.
        *__, **_: Ignored; accepted so every analysis function can be called
            with the same arguments.

    Returns:
        list: A single figure with linear/log toggles for both axes.
    """
    per_new = df.groupby(by=['new_token_logits'], as_index=False)[['new_token_id']].count()
    per_new.columns = ['logits', '#new_token']
    per_old = df.groupby(by=['old_token_logits'], as_index=False)[['new_token_id']].count()
    per_old.columns = ['logits', '#old_token']

    counts = (
        per_new.merge(per_old, how='outer')
        .fillna(0)  # a logit value seen on one side only counts as 0 on the other
        .sort_values(by='logits', ascending=True)
        .reset_index(drop=True)
    )

    # Normal view
    fig = counts.plot(x='logits', y=['#new_token', '#old_token'], title='Logits Distribution')

    # Log toggles for the x and y axis, side by side above the plot
    left, top = 0.12, 1.15
    step = 0.30
    fig.update_layout(
        title_text="Logits Distribution [NewTokens Vs OldTokens]",
        showlegend=True,
        width=width*2.2,
        updatemenus=[
            log_button(left, top, 'xaxis'),
            log_button(left + step*1, top, 'yaxis'),
        ],  # type: ignore
    )
    return [fig]

import plotly.graph_objects as go

def get_rank_diff_whiskers(df: pd.DataFrame, *__, width=1000, height=400, **_):
    """Box plots of the rank shift (new rank - old rank), one panel per model.

    Args:
        df: Frame with the columns ``model``, ``new_token_rank`` and
            ``old_token_rank``. It is not modified.
        width: Width of each model panel; the figure is ``width * n_models`` wide.
        height: Height of the figure.
        *__, **_: Ignored; accepted so every analysis function can be called
            with the same arguments.

    Returns:
        list: A single figure with one vertical box plot per model (models in
        sorted order), or an empty list when ``df`` contains no model.

    Each panel is titled with its model name. The zero reference line, axis
    title and sizing are applied to the returned figure itself, because only
    traces (not layout or shapes) carry over when copying between figures.
    """
    rank_diff = df['new_token_rank'] - df['old_token_rank']

    # Get unique models and sort them for consistent ordering
    models = sorted(df['model'].unique())
    if not models:
        # make_subplots cannot build a grid with zero columns
        return []

    fig = make_subplots(rows=1, cols=len(models), subplot_titles=[str(model) for model in models])

    # Create one vertical box plot per model
    for col, model in enumerate(models, start=1):
        fig.add_trace(
            go.Box(
                y=rank_diff[df['model'] == model],
                name=model,
                boxpoints='outliers',
                marker_color='rgb(8,81,156)',
                line_color='rgb(8,81,156)',
            ),
            row=1,
            col=col,
        )

    # Add horizontal reference line at y=0 to every panel
    fig.add_hline(y=0, line_dash="dash", line_color="red", row="all", col="all")

    fig.update_yaxes(title_text="Rank Difference", row=1, col=1)
    fig.update_layout(
        title_text="Rank Difference per Model (New - Old Rank)",
        showlegend=False,  # the panel titles already name the models
        width=width * len(models),
        height=height,
    )
    return [fig]

def get_metrics_aux(result, version):
    """Flatten the metrics and benchmark scores of one run into a DataFrame.

    Args:
        result: Mapping with a ``'RESULTS'`` entry (model type -> mapping with
            ``'Metrics'`` and ``'Benchmarks'``) and a
            ``'RUN_CONFIGS'`` entry providing ``'model_name'``.
        version: Label stored in the ``version`` column of every row.

    Returns:
        pd.DataFrame: One row per model type with the columns ``model``,
        ``version``, ``model_type``, then one column per metric and one per
        benchmark (holding that benchmark's ``'result'`` value).
    """
    rows = [
        {
            'model': result['RUN_CONFIGS']['model_name'],
            'version': version,
            'model_type': model_type,
            **payload['Metrics'],
            **{name: bench['result'] for name, bench in payload['Benchmarks'].items()},
        }
        for model_type, payload in result['RESULTS'].items()
    ]
    return pd.DataFrame(rows)
def get_metrics(result, version, *_, **__):
    """Display the metrics table of one run as an HTML widget.

    Extra positional and keyword arguments are accepted and ignored so the
    function can be called with the same arguments as the plotting helpers.
    """
    table_html = get_metrics_aux(result, version).to_html(index=False)
    display(ipywidgets.HTML(table_html))


In [7]:
def run_analysis(df, result: dict, *args, **kwargs):
    """Render the full report of one run: config, metrics table and figures.

    Args:
        df: Analysis frame handed to every figure builder.
        result: Run results; ``result['RUN_CONFIGS']`` is listed as bullet
            points and ``result`` is forwarded to ``get_metrics``.
        *args, **kwargs: Forwarded unchanged to ``get_metrics`` and to every
            figure builder.
    """
    config_items = '\n'.join(f'<li>{k}: {v}</li>' for k, v in result['RUN_CONFIGS'].items())
    display(ipywidgets.HTML('<h3>Run Config</h3>\n<ul>{}</ul><h3>Metrics</h3>'.format(config_items)))
    get_metrics(result, *args, **kwargs)

    display(ipywidgets.HTML('<h3>Comparisons (NewTokens Vs OldTokens)</h3>'))
    builders = (get_rank_comparison, get_logit_comparison, get_rank_diff_whiskers)
    # Build every figure first, so nothing is shown if one of the builders fails
    figs = [fig for build in builders for fig in build(df, *args, **kwargs)]
    for fig in figs:
        fig.show()


In [None]:
%matplotlib inline

# Only the timestamp keys are offered; the numeric aliases in `dfs` are skipped
options = sorted(str(file) for file in dfs.keys() if isinstance(file, str))

# Run selector, with an extra 'all' entry selected by default
file_selection: ipywidgets.Dropdown = ipywidgets.Dropdown(
    description='File:',
    options=[*options, 'all'],
    value='all',
    disabled=False,
)

# Both size sliders share range, step and read-out settings
_size_slider_kwargs = dict(
    min=200,
    max=1000,
    step=1,
    orientation='horizontal',
    readout=True,
    readout_format='d',
)
width_slider: ipywidgets.IntSlider = ipywidgets.IntSlider(
    description='Width:',
    value=700,
    **_size_slider_kwargs,
)
height_slider: ipywidgets.IntSlider = ipywidgets.IntSlider(
    description='Height:',
    value=400,
    **_size_slider_kwargs,
)

execute_button = ipywidgets.Button(
    description='Execute',
    tooltip='Execute',
    button_style='info', # 'success', 'info', 'warning', 'danger' or ''
    disabled=False,
)

# Bordered area that receives everything rendered by the click handler
out: ipywidgets.Output = ipywidgets.Output(layout={'border': '1px solid black'})
def on_button_clicked(_):
    """Render the report of the selected run(s) into the `out` widget.

    Reads the current values of `file_selection`, `width_slider` and
    `height_slider`. Selecting 'all' renders every entry of `options`.
    The output area is not cleared first, so repeated clicks append reports.

    Args:
        _: The clicked button (unused).
    """
    selected = file_selection.value
    items = options.copy() if selected == 'all' else [selected]
    for analysis in items:
        df = dfs[analysis] # type: ignore
        # Frames and results share the timestamp; only the key prefix differs
        result = results[analysis.replace('analysis', 'results')]  # type: ignore
        with out:
            # Heading: model name without the '[NEW_TOKENS]' marker, then the run key
            display(ipywidgets.HTML('<h1 style="text-align:center">MODEL: {}</h1>'.format(df['model'].min().replace('[NEW_TOKENS]', ''))))
            display(ipywidgets.HTML(f'<h3 style="text-align:center; margin-top: -10px">{analysis}</h3>'))

            # `height` must be spelled correctly: the analysis functions
            # swallow unknown keyword arguments without complaint.
            run_analysis(df, result, analysis, width=width_slider.value, height=height_slider.value)
            # Separator between consecutive reports
            display(ipywidgets.HTML('<div style="position: relative; width:100%; margin: 20px; height:4px;border-bottom: solid black 1px;border-top: solid black 1px;"></div>'))

execute_button.on_click(on_button_clicked)

In [14]:
# Metrics of every run in one table, one row per (run, model type)
output = []
for version, result in results.items():
    # Only `number_new_tokens` is carried over from the run configuration
    run_metrics = get_metrics_aux(result, version).assign(
        number_new_tokens=result['RUN_CONFIGS']['number_new_tokens']
    )
    output.append(run_metrics)

tmp = (
    pd.concat(output)
    .reset_index(drop=True)
    .sort_values(by=['model', 'model_type', 'number_new_tokens'])
)
tmp

Unnamed: 0,model,version,model_type,Fertility,Perplexity,SupergluePTPT,CalamePT,number_new_tokens
3,HuggingFaceTB/SmolLM2-135M,results@2025-07-19 21:38:00,BASELINE,2.819213,282.396896,0.014427,0.13632,1000
15,HuggingFaceTB/SmolLM2-135M,results@2025-07-19 23:13:55,BASELINE,2.819213,282.396896,0.014427,0.13632,5000
6,HuggingFaceTB/SmolLM2-135M,results@2025-07-20 00:53:18,BASELINE,2.819213,282.396896,0.014427,0.13632,7500
0,HuggingFaceTB/SmolLM2-135M,results@2025-07-19 17:02:00,BASELINE,2.819213,282.396896,0.014427,0.13632,10000
4,HuggingFaceTB/SmolLM2-135M,results@2025-07-19 21:38:00,INITIALIZED_NO_TRAINING,2.447623,1130.855321,0.014427,0.13632,1000
16,HuggingFaceTB/SmolLM2-135M,results@2025-07-19 23:13:55,INITIALIZED_NO_TRAINING,2.300432,1915.127496,0.01449,0.13632,5000
7,HuggingFaceTB/SmolLM2-135M,results@2025-07-20 00:53:18,INITIALIZED_NO_TRAINING,2.253377,2375.516706,0.014427,0.13632,7500
1,HuggingFaceTB/SmolLM2-135M,results@2025-07-19 17:02:00,INITIALIZED_NO_TRAINING,2.257951,2773.625544,0.01449,0.13632,10000
5,HuggingFaceTB/SmolLM2-135M,results@2025-07-19 21:38:00,INITIALIZED_WITH_TRAINING,2.447623,1203.891172,0.014427,0.13632,1000
17,HuggingFaceTB/SmolLM2-135M,results@2025-07-19 23:13:55,INITIALIZED_WITH_TRAINING,2.300432,2160.818526,0.01449,0.13632,5000


In [10]:
# Show the controls followed by the output area.
# The height slider is included so that its value can actually be changed.
display(file_selection, width_slider, height_slider, execute_button, out)

Dropdown(description='File:', index=8, options=('analysis@2025-07-19 17:02:00', 'analysis@2025-07-19 19:51:12'…

IntSlider(value=700, description='Width:', max=1000, min=200)

Button(button_style='info', description='Execute', style=ButtonStyle(), tooltip='Execute')

Output(layout=Layout(border_bottom='1px solid black', border_left='1px solid black', border_right='1px solid b…